In [104]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from collections import Counter
import re
from part_of_speech import get_part_of_speech
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans 
import numpy as np




In [47]:
df = pd.read_csv('../okcupiddata/profiles.csv')
df.columns

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object')

In [76]:
#make dataframe just containing essays
df_essays = df[['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']]
df_essays['corpus'] = df_essays[df_essays.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)
corpus = df_essays['corpus'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_essays['corpus'] = df_essays[df_essays.columns[0:]].apply(


In [94]:
def remove_noise(text):
    #remove html tags
    import re
    clean = re.compile('<.*?>')
    text = re.sub(clean, '', text)
    #remove punctuation
    text = re.sub(r'[^\w\s]','',text)
    #remove newline, tab
    text = re.sub(r'\r+|\n+|\t+',' ',text)
    return text

def tokenize_lemmatize_remove_stop(text):
    tokenized = word_tokenize(str(text))
    #print(len(tokenized))
    lemmatized = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized]
    no_stop = [word for word in lemmatized if word not in stop_words]
    return no_stop

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [95]:
denoised = [remove_noise(str(i)) for i in corpus]
normalized = [tokenize_lemmatize_remove_stop(i) for i in denoised]
print(normalized[0])

['would', 'love', 'think', 'kind', 'intellectual', 'either', 'dumbest', 'smart', 'guy', 'smart', 'dumb', 'guy', 'cant', 'say', 'tell', 'difference', 'love', 'talk', 'idea', 'concept', 'forge', 'odd', 'metaphor', 'instead', 'recite', 'cliche', 'like', 'simularities', 'friend', 'mine', 'house', 'underwater', 'salt', 'mine', 'favorite', 'word', 'salt', 'way', 'weird', 'choice', 'know', 'thing', 'life', 'well', 'metaphor', 'seek', 'make', 'little', 'well', 'everyday', 'productively', 'lazy', 'way', 'get', 'tire', 'tie', 'shoe', 'consider', 'hire', 'five', 'year', 'old', 'would', 'probably', 'tie', 'shoe', 'decide', 'wear', 'leather', 'shoe', 'dress', 'shoe', 'love', 'really', 'serious', 'really', 'deep', 'conversation', 'really', 'silly', 'stuff', 'snap', 'light', 'hearted', 'rant', 'kiss', 'dont', 'funny', 'able', 'make', 'laugh', 'able', 'bend', 'spoon', 'mind', 'telepathically', 'make', 'smile', 'still', 'work', 'love', 'life', 'cool', 'let', 'wind', 'blow', 'extra', 'point', 'read', 'g

In [102]:
#initialize and fit TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(denoised)

tfidf_norm = normalize(tfidf)
tfidf_array = tfidf_norm.toarray()

tfidf_df = pd.DataFrame(tfidf_array, columns = vectorizer.get_feature_names())

    00  000  000000001  000000002  000000bubbles  00001  000you  001  002  \
0  0.0  0.0        0.0        0.0            0.0    0.0     0.0  0.0  0.0   
1  0.0  0.0        0.0        0.0            0.0    0.0     0.0  0.0  0.0   
2  0.0  0.0        0.0        0.0            0.0    0.0     0.0  0.0  0.0   
3  0.0  0.0        0.0        0.0            0.0    0.0     0.0  0.0  0.0   
4  0.0  0.0        0.0        0.0            0.0    0.0     0.0  0.0  0.0   

   003  ...  zzzzzz  zzzzzzz  zzzzzzzits  zzzzzzzz  zzzzzzzzz  zzzzzzzzzz  \
0  0.0  ...     0.0      0.0         0.0       0.0        0.0         0.0   
1  0.0  ...     0.0      0.0         0.0       0.0        0.0         0.0   
2  0.0  ...     0.0      0.0         0.0       0.0        0.0         0.0   
3  0.0  ...     0.0      0.0         0.0       0.0        0.0         0.0   
4  0.0  ...     0.0      0.0         0.0       0.0        0.0         0.0   

   zzzzzzzzzzingmy  zzzzzzzzzzs  zzzzzzzzzzzzshow  zzzzzzzzzzzzzzzzdigital

In [105]:
class Kmeans:
    """ K Means Clustering
    
    Parameters
    -----------
        k: int , number of clusters
        
        seed: int, will be randomly set if None
        
        max_iter: int, number of iterations to run algorithm, default: 200
        
    Attributes
    -----------
       centroids: array, k, number_features
       
       cluster_labels: label for each data point
       
    """
    
    def __init__(self, k, seed = None, max_iter = 200):
        self.k = k
        self.seed = seed
        if self.seed is not None:
            np.random.seed(self.seed)
        self.max_iter = max_iter
        
            
    
    def initialise_centroids(self, data):
        """Randomly Initialise Centroids
        
        Parameters
        ----------
        data: array or matrix, number_rows, number_features
        
        Returns
        --------
        centroids: array of k centroids chosen as random data points 
        """
        
        initial_centroids = np.random.permutation(data.shape[0])[:self.k]
        self.centroids = data[initial_centroids]

        return self.centroids
    
    
    def assign_clusters(self, data):
        """Compute distance of data from clusters and assign data point
           to closest cluster.
        
        Parameters
        ----------
        data: array or matrix, number_rows, number_features
        
        Returns
        --------
        cluster_labels: index which minmises the distance of data to each
        cluster
            
        """
        
        if data.ndim == 1:
            data = data.reshape(-1, 1)
        
        dist_to_centroid =  pairwise_distances(data, self.centroids, metric = 'euclidean')
        self.cluster_labels = np.argmin(dist_to_centroid, axis = 1)
        
        return  self.cluster_labels
    
    
    def update_centroids(self, data):
        """Computes average of all data points in cluster and
           assigns new centroids as average of data points
        
        Parameters
        -----------
        data: array or matrix, number_rows, number_features
        
        Returns
        -----------
        centroids: array, k, number_features
        """
        
        self.centroids = np.array([data[self.cluster_labels == i].mean(axis = 0) for i in range(self.k)])
        
        return self.centroids
    
    
    
    def predict(self, data):
        """Predict which cluster data point belongs to
        
        Parameters
        ----------
        data: array or matrix, number_rows, number_features
        
        Returns
        --------
        cluster_labels: index which minmises the distance of data to each
        cluster
        """
        
        return self.assign_clusters(data)
    
    def fit_kmeans(self, data):
        """
        This function contains the main loop to fit the algorithm
        Implements initialise centroids and update_centroids
        according to max_iter
        -----------------------
        
        Returns
        -------
        instance of kmeans class
            
        """
        self.centroids = self.initialise_centroids(data)
        
        # Main kmeans loop
        for iter in range(self.max_iter):

            self.cluster_labels = self.assign_clusters(data)
            self.centroids = self.update_centroids(data)          
            if iter % 100 == 0:
                print("Running Model Iteration %d " %iter)
        print("Model finished running")
        return self   

In [106]:
number_clusters = range(1, 7)

kmeans = [KMeans(n_clusters=i, max_iter = 600) for i in number_clusters]
kmeans

score = [kmeans[i].fit(Y_sklearn).score(Y_sklearn) for i in range(len(kmeans))]
score

plt.plot(number_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Method')
plt.show()

NameError: name 'Y_sklearn' is not defined