### Necessary imports

In [None]:
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here

In [None]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import re
import pandas as pd

In [None]:
!#python3 -m spacy download en

In [None]:
#!python -m spacy download en_core_web_sm

In [None]:
spacy.load('en_core_web_sm')

### Load data

In [None]:
data = pd.read_excel('Masterdaten1_BMW.xlsx')
data1=data.drop(['Level2', 'Level1'], axis = 1)
data1

In [None]:
print(data1.dtypes)

In [None]:
rating = data['rating']

In [None]:
data1['clean_sentence']

### Data cleaning (with `simple_preprocess` from gensim)

In [None]:
def sent_to_words(sentences):
    """Tokenize raw sentences into lists of word tokens.

    Each sentence has e-mail-like tokens removed, runs of whitespace
    (including newlines) collapsed to one space and single quotes dropped,
    and is then tokenized with ``gensim.utils.simple_preprocess``
    (called with ``deacc=True``).

    Parameters
    ----------
    sentences : iterable
        Raw sentences. Every item is passed through ``str()`` before the
        regex clean-up, so a non-string cell no longer raises in ``re.sub``.

    Yields
    ------
    list
        The tokens returned by ``simple_preprocess`` for one sentence.
    """
    for sent in sentences:
        sent = str(sent)  # coerce first: re.sub only accepts str/bytes
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # remove newline chars
        sent = re.sub(r"'", "", sent)  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

# Convert the sentence column to a plain list.
# A separate name is used so the `data` DataFrame is not replaced by a list:
# this keeps the cell re-runnable and keeps `data` usable as a DataFrame later.
sentences = data['clean_sentence'].values.tolist()
data_words = list(sent_to_words(sentences))
print(data_words[:3])

### bigram and trigram models

In [None]:
# Build the bigram and trigram models
# The trigram model is trained on the corpus after the bigram model has been applied to it.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Wrap both trained models in Phraser objects; these are what process_words applies to each document.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python -m spacy download en_core_web_sm  # run in terminal once
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stopwords, form bigrams/trigrams and lemmatize.

    Parameters
    ----------
    texts : iterable
        Documents; each one is passed through ``str()`` and
        ``simple_preprocess`` before stopword filtering.
    stop_words : collection of str
        Words dropped before phrase detection and again after lemmatization.
    allowed_postags : list of str
        spaCy coarse POS tags (``token.pos_``); only tokens with one of
        these tags are kept.

    Returns
    -------
    list of list of str
        One list of lemmas per input document.
    """
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    # NOTE(review): bigram_mod is applied a second time here on already-merged docs; kept as in the original.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Load the pipeline by its package name: the 'en' shortcut is not available
    # in current spaCy releases, and 'en_core_web_sm' is the model this notebook
    # downloads and loads elsewhere.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!

In [None]:
type(data_ready)

### Dictionary

In [None]:
id2word = corpora.Dictionary(data_ready)
id2word.filter_extremes(no_below=40,no_above=0.6)

In [None]:
type(id2word)

### Corpus

In [None]:
# Create Corpus: Term Document Frequency
corpus_doc2bow = [id2word.doc2bow(text) for text in data_ready]

In [None]:
#type(corpus_doc2bow)

In [None]:
#corpus_doc2bow[0]

In [None]:
tfidf = gensim.models.TfidfModel(corpus_doc2bow)
corpus_tfidf = tfidf[corpus_doc2bow]

In [None]:
#id2word[0]

In [None]:
#corpus_tfidf[0]

In [None]:
#type(corpus_tfidf)

### LDA Model

In [None]:
# Build LDA model
# Trained on the TF-IDF weighted corpus with the filtered dictionary; the result is
# stored in `lda_model_tfidf`, which the visualisation and evaluation cells use.
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10000,
                                           passes=10,
                                           alpha='symmetric',
                                           eta='auto',
                                           iterations=50,
                                           per_word_topics=True)

#print(lda_model_tfidf.print_topics())

In [None]:
lda_model_tfidf.show_topics(num_words=150)

In [None]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, id2word, sort_topics=False)

In [None]:
p = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, id2word)
pyLDAvis.save_html(p, 'finalo6topicsfinal2.html')

In [None]:
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model_tfidf, texts=data_ready, dictionary=id2word, window_size=60, coherence='c_v')

# Calculate and print coherence
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)

______

## Evaluation

In [None]:
import gensim
from gensim.models import CoherenceModel

CoherenceModel(lda_model_tfidf, texts=data_ready, dictionary=id2word, window_size=60).get_coherence()

In [None]:
# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=data_ready, dictionary=id2word, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Compute Coherence Score using C_UCI
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=data_ready, dictionary=id2word, coherence="c_uci")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, window_size, start=2, step=1):
    """
    Compute u_mass coherence for various numbers of topics.

    One LDA model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a ``CoherenceModel``
    using ``coherence='u_mass'``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) of the number of topics
    window_size : Passed through to ``CoherenceModel``
    start : First number of topics to try
    step : Increment between topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Use the fully qualified class: only the `gensim` package itself is imported by name.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, window_size=window_size, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list1, coherence_values1 = compute_coherence_values(dictionary=id2word, corpus=corpus_tfidf, texts=data_ready, limit=20, window_size=60, start=2, step=1)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, window_size, start=2, step=1):
    """Train one LDA model per topic count and score each one.

    Topic counts are taken from ``range(start, limit, step)``. Every model is
    scored with a ``CoherenceModel`` built from ``texts``, ``dictionary`` and
    ``window_size``; no ``coherence`` argument is passed, so the measure is
    whatever ``CoherenceModel`` uses by default.

    Returns
    -------
    model_list : list of trained LDA models, in topic-count order
    coherence_values : list of coherence scores, aligned with ``model_list``
    """
    model_list = []
    coherence_values = []

    for topic_count in range(start, limit, step):
        lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_count)
        model_list.append(lda)

        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, window_size=window_size)
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus_tfidf, texts=data_ready, limit=20, window_size=60, start=2, step=1)

In [None]:
import matplotlib.pyplot as plt
# Plot the coherence score against the number of topics.
limit=20; start=2; step=1;
x = range(start, limit, step)
# Label the line itself: a bare string passed to legend() is iterated character
# by character, so the legend would only show "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.title("Coherence score tfidf")
plt.legend(loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Coherence values for varying alpha
def compute_coherence_values_ALPHA(corpus, dictionary, num_topics, texts, start, limit, step, random_state=100):
    """Train LDA models over a range of alpha values and score each with c_v coherence.

    For every integer ``i`` in ``range(start, limit, step)`` a model is
    trained with ``alpha = i / 10`` and ``passes=100``.

    Parameters
    ----------
    corpus : Gensim corpus
    dictionary : Gensim dictionary
    num_topics : Number of topics for every model
    texts : List of tokenized texts used for the coherence computation
    start, limit, step : Integer range; each value is divided by 10 to get alpha
    random_state : Seed passed to every model. The original body read an
        undefined global ``seed``; the default of 100 matches the
        ``random_state`` used for the other models in this notebook.

    Returns
    -------
    model_list : List of trained models, one per alpha
    coherence_values : Coherence scores aligned with ``model_list``
    """
    coherence_values = []
    model_list = []
    for alpha in range(start, limit, step):
        model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=random_state, alpha=alpha/10, passes=100)
        model_list.append(model)
        coherencemodel = gensim.models.CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Sweep alpha for the topic count of the trained model (`num_topics` was never
# defined as a notebook variable, so it is read from `lda_model_tfidf`).
model_list, coherence_values = compute_coherence_values_ALPHA(dictionary=id2word, corpus=corpus_tfidf, num_topics=lda_model_tfidf.num_topics, texts=data_ready, start=1, limit=10, step=1)

# Plot graph of coherence values by varying alpha
limit=10; start=1; step=1;
x_axis = [alpha_step / 10 for alpha_step in range(start, limit, step)]
# Label the line itself: a bare string passed to legend() is iterated character by character.
plt.plot(x_axis, coherence_values, label="coherence")
plt.xlabel("Alpha")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
import numpy as np
import tqdm

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train one LDA model for a hyper-parameter combination and return its c_v coherence.

    Parameters
    ----------
    corpus : Gensim corpus the model is trained on
    dictionary : Gensim dictionary used for training and for the coherence model
    k : Number of topics
    a : Alpha value (number or a string understood by gensim)
    b : Eta/beta value (number or a string understood by gensim)

    Returns
    -------
    float
        c_v coherence of the newly trained model, computed on the
        notebook-level ``data_ready`` texts.
    """
    # Train on the corpus that was passed in, not on a notebook global.
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=10000,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score the model trained above; scoring a fixed global model would return
    # the same value for every parameter combination.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_ready, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 4
max_topics = 20
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first 75% of the TF-IDF corpus and the full TF-IDF corpus.
# The document limit is converted to int because a document count must be a whole
# number, and the full corpus is `corpus_tfidf` (no variable named `corpus` exists).
num_of_docs = len(corpus_tfidf)
corpus_sets = [gensim.utils.ClippedCorpus(corpus_tfidf, int(num_of_docs * 0.75)),
               corpus_tfidf]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [None]:
# Grid search: one LDA model per (validation corpus, topic count, alpha, beta).
# The progress-bar total is derived from the grid so it always matches the loops.
total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
pbar = tqdm.tqdm(total=total_runs)

# iterate through validation corpuses
for corpus_set, set_title in zip(corpus_sets, corpus_title):
    # iterate through number of topics
    for n_topics in topics_range:
        # iterate through alpha values
        for alpha_value in alpha:
            # iterate through beta values
            for beta_value in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                              k=n_topics, a=alpha_value, b=beta_value)
                # Save the model results
                model_results['Validation_Set'].append(set_title)
                model_results['Topics'].append(n_topics)
                model_results['Alpha'].append(alpha_value)
                model_results['Beta'].append(beta_value)
                model_results['Coherence'].append(cv)

                pbar.update(1)
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
pbar.close()

_____

## Wordclouds

In [None]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

# One Tableau colour per topic.
cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(
    stopwords=stop_words,
    background_color='white',
    width=2500,
    height=1800,
    max_words=10,
    colormap='tab10',
    # `i` is resolved when the colour is requested, i.e. it is the loop index below.
    color_func=lambda *args, **kwargs: cols[i],
    prefer_horizontal=1.0,
)

# (topic_id, [(word, weight), ...]) pairs
topics = lda_model_tfidf.show_topics(formatted=False)

fig, axes = plt.subplots(3, 2, figsize=(10,10), sharex=True, sharey=True)

# Draw one word cloud per subplot, weighted by the topic's word weights.
for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

_____

## Overview

In [None]:
all_topics = lda_model_tfidf.get_document_topics(corpus_tfidf, minimum_probability=0)
all_topics[0]

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus_tfidf, texts=data,ratings=data, documents=data):
    """Build a per-document overview of the dominant LDA topic.

    Parameters
    ----------
    ldamodel : trained LDA model; indexed with ``corpus`` and queried via
        ``show_topic`` and ``per_word_topics``
    corpus : corpus whose documents are assigned to topics
    texts : processed texts, one entry per document
    ratings : ratings, one entry per document
    documents : original documents, one entry per document

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords``, followed by ``texts``, ``ratings`` and
        ``documents`` concatenated column-wise (aligned on the index).
    """
    rows = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first item is the topic distribution.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep one output row per document so the columns added below stay aligned.
            rows.append((float('nan'), float('nan'), ''))
            continue
        # Dominant topic = highest probability (first one wins on ties).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Build the frame once instead of appending row by row
    # (DataFrame.append is not available in pandas 2.x).
    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add processed text, rating and original document to the end of the output
    contents = pd.Series(texts)
    rating = pd.Series(ratings)
    docs = pd.Series(documents)
    sent_topics_df = pd.concat([sent_topics_df, contents, rating, docs], axis=1)
    return sent_topics_df

In [None]:
data1['clean_sentence']

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_tfidf, corpus=corpus_tfidf, texts=data_ready, ratings=data1['rating'], documents = data1['clean_sentence'])

In [None]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text','Rating','Docs']
#df_dominant_topic.head(10)
df_dominant_topic

In [None]:
# Write next to the notebook, like the other exports: the absolute user-specific
# path (with a stray backslash in the file name) only worked on one machine.
df_dominant_topic.to_excel('Overviewtopics9_Masterdata.xlsx', index = False)

____

## Embedded projector

In [None]:
# create file for tensors(vectors): one line per document, one tab-terminated
# topic probability per column
with open('wcc35LW_lda_tensor.tsv','w') as w:
    for doc_topics in all_topics:
        cells = [str(probability) + "\t" for _, probability in doc_topics]
        w.write("".join(cells))
        w.write("\n")

In [None]:
# create file for metadata (dominant topic and document text)
import io
import numpy as np
with io.open('wcc35LW_lda_metadata.tsv','w', encoding="utf-8") as w:
    for doc_id in range(len(all_topics)):
        # str() replaces .astype(np.str): the np.str alias no longer exists in current NumPy.
        # The text is read from the data1 DataFrame because `data` is not reliably a DataFrame here.
        w.write(str(df_dominant_topic.Dominant_Topic[doc_id]) + "\t" + str(data1['clean_sentence'].iloc[doc_id]) + "\n")

In [None]:
# create file for metadata (document texts)
# Read from the data1 DataFrame (`data` is not reliably a DataFrame here) and write
# UTF-8 explicitly so non-ASCII characters do not depend on the platform default.
with open('doc_lda_metadata.tsv','w', encoding="utf-8") as w:
    for doc_id in range(len(all_topics)):
        w.write(str(data1['clean_sentence'].iloc[doc_id]) + "\n")

______

## Heatmap

In [None]:
import plotly.offline as py
from plotly.graph_objs import *
import plotly.figure_factory as ff

py.init_notebook_mode()

def plot_difference(mdiff, title="", annotation=None):
    """
    Plot a topic-difference matrix as a heatmap.

    mdiff      : matrix of distances, used as the heatmap values
    title      : figure title
    annotation : optional matrix of (intersection_tokens, difference_tokens)
                 pairs; rendered as hover text "+++ ...<br>--- ..."
    """
    if annotation is None:
        annotation_html = None
    else:
        annotation_html = []
        for row in annotation:
            cells = []
            for int_tokens, diff_tokens in row:
                cells.append("+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens)))
            annotation_html.append(cells)

    heatmap = Heatmap(z=mdiff, colorscale='RdBu', text=annotation_html)
    axis_topic_x = dict(title="topic")
    axis_topic_y = dict(title="topic")
    layout = Layout(width=950, height=950, title=title, xaxis=axis_topic_x, yaxis=axis_topic_y)
    py.iplot({'data': [heatmap], 'layout': layout})

In [None]:
difference_matrix, annotation = lda_model_tfidf.diff(lda_model_tfidf, distance='jensen_shannon', num_words=50)
plot_difference(difference_matrix, title="Topic difference [jensen shannon distance]", annotation=annotation)

In [None]:
doc_number = 0
doc_topic, word_topic, phi_value = lda_model_tfidf.get_document_topics(corpus_tfidf[doc_number], per_word_topics=True)

In [None]:
doc_topic

In [None]:
word_topic

In [None]:
phi_value

____

# Clustering K-Means

In [None]:
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here

In [None]:
from sklearn.decomposition import PCA

In [None]:
#data_ready

In [None]:
data1

In [None]:
print(data1.dtypes)

In [None]:
# Take the sentences from the data1 DataFrame: `data` has been reassigned by
# earlier cells and cannot be indexed by column name here.
newdata = data1['clean_sentence'].values
type(newdata)

In [None]:
#data_ready

In [None]:
#data_ready[1]

In [None]:
# Wrap the processed token lists in a Series (one entry per document).
datablock = pd.Series(data_ready)
# NOTE(review): the returned frame is discarded, so this line has no lasting effect.
datablock.to_frame()
# NOTE(review): the Series index presumably holds integer positions, so the string key '0' likely matches nothing — confirm.
datablock1 = datablock.rename({'0': 'clean_sentence'}, axis=1)

In [None]:
# Name the column while converting; assigning a set to `.columns` relies on an
# unordered container and is not accepted by every pandas version.
neu1 = datablock1.to_frame(name='clean_sentence')

In [None]:
neu1

In [None]:
#data1.join(neu1)

In [None]:
neu1['clean_sentence'] = [','.join(map(str, l)) for l in neu1['clean_sentence']]
neu1

In [None]:
import re

In [None]:
#neu1["clean_sentence"] = neu1['clean_sentence'].str.replace([^\w\s]' , '   ')
neu1["clean_sentence"] = neu1['clean_sentence'].str.replace(',' , ' ')                                                           
#neu1['clean_sentence'].str.replace('[{}]'.format(string.punctuation), '  ')
#neu1["new_column"] = neu1['clean_sentence'].str.strip()
neu1

In [None]:
neu2=neu1.join(data1.rating)
neu2

In [None]:
X = neu2['clean_sentence'].to_numpy()
X

In [None]:
# NOTE: disabled. X holds the sentence strings, not numeric features, so
# fit_transform raises here. The projection is computed further below from the
# dense TF-IDF matrix.
# pca = PCA(n_components = 2)
# principal_components = pca.fit_transform(X)

In [None]:
print(neu2.dtypes)

In [None]:
# NOTE: disabled. 'clean_sentence' holds space-separated word strings, so
# casting the column to int always raises a ValueError.
# neu1['clean_sentence'] = neu1['clean_sentence'].astype(int)

In [None]:
df2 = pd.concat([data1.clean_sentence, neu1, data1.rating], axis=1)
df2

In [None]:
# Write next to the notebook: the absolute user-specific path (with a stray
# backslash in the file name) only worked on one machine.
df2.to_excel('Vergleich_proprocessing.xlsx', index = False)

In [None]:
type(neu2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
# array erstellen mit meinen Daten
#documents = data1['clean_sentence'].values.astype("U")
documents = neu2['clean_sentence'].values.astype("U")
#documents = data1['clean_sentence']
#documents = arr

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
print(vectorizer)

In [None]:
features = vectorizer.fit_transform(documents)
print(features)
#features

In [None]:
newfeatures=features.toarray()
#newfeatures

In [None]:
pca = PCA(n_components = 2)
principal_components = pca.fit_transform(newfeatures)

In [None]:
principal_components

In [None]:
kmeans = KMeans(n_clusters=3)
y_kmeans = kmeans.fit_predict(principal_components)

In [None]:
y_kmeans

In [None]:
# plot for data points
plt.scatter(principal_components[:, 0], principal_components[:, 1], c=y_kmeans, cmap='viridis')

# plot for centroids
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black');

In [None]:
import numpy as np

In [None]:
# alternativ der gleiche Plot, diesmal mit Cluster Label:

# unique clusters
u_clusters = np.unique(y_kmeans)

# plot for data points
for i in u_clusters:
    plt.scatter(principal_components[y_kmeans == i, 0], principal_components[y_kmeans == i, 1], label = i)
    
# plot for centroids
plt.scatter(centers[:, 0], centers[:, 1], c='black')

plt.legend();

In [None]:
u_clusters

In [None]:
plt.figure(figsize=(20, 20))
plt.spy(features, markersize=1)

In [None]:
import scipy.sparse as sparse

In [None]:
plt.spy(features)

In [None]:
pd.DataFrame(features.toarray())

In [None]:
k = 6
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10)
model.fit(features)

In [None]:
neu2['cluster'] = model.labels_

In [None]:
file=neu2['cluster']
file

In [None]:
dffile = pd.DataFrame(data=file)
#dffile

In [None]:
dffile['sentence']=neu2['clean_sentence']

In [None]:
dffile['rating']=neu2['rating']

In [None]:
dffile

In [None]:
# Write next to the notebook: the absolute user-specific path (with a stray
# backslash in the file name) only worked on one machine.
dffile.to_excel('clusteruebersicht5.xlsx', index = False)

In [None]:
#data

clusters = neu2.groupby('cluster')

# Write the sentences of every cluster to its own file, cluster<N>.csv, with the
# row index exported under the column name 'id'. pandas writes the file directly
# (no manually opened handle to leak), and a dedicated variable name keeps the
# notebook-level `data` untouched.
for cluster in clusters.groups:
    cluster_sentences = clusters.get_group(cluster)[['clean_sentence']]
    cluster_sentences.to_csv('cluster' + str(cluster) + '.csv', index_label='id')

In [None]:
data

In [None]:
print(clusters)

In [None]:
# NOTE(review): `clusters` is a GroupBy object; building a DataFrame from it
# presumably yields (cluster key, group) pairs rather than one row per
# sentence — confirm this is the intended export.
df = pd.DataFrame(data=clusters)
df.head()
# Write next to the notebook instead of an absolute, user-specific path.
df.to_excel('clusteruebersicht.xlsx', index = False)

In [None]:
print("Cluster centroids: \n")
# Term indices per cluster, sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() is gone from recent scikit-learn releases; use its
# replacement when available and fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
# Iterate over the clusters of the fitted model rather than the mutable global `k`.
for i in range(model.n_clusters):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :50]: # print out the top 50 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

In [None]:
 # Compact variant of the listing above: ten top terms per cluster on one line.
 # Uses the fitted `model` (the names `km` and `true_k` do not exist in this notebook).
 print("Top terms per cluster:")
 order_centroids = model.cluster_centers_.argsort()[:, ::-1]
 if hasattr(vectorizer, 'get_feature_names_out'):
     terms = vectorizer.get_feature_names_out()
 else:
     terms = vectorizer.get_feature_names()
 for i in range(model.n_clusters):
     print("Cluster %d:" % i, end='')
     for ind in order_centroids[i, :10]:
         print(' %s' % terms[ind], end='')
     # end the line once per cluster, after all of its terms
     print()

In [None]:
from scipy.spatial.distance import cdist

In [None]:
import numpy as np

In [None]:
distortions = []
K = range(1,12)

# Fit K-Means for every candidate cluster count and record its inertia.
# A dedicated loop variable is used so the notebook-level `k` (the cluster count
# of the main model) is not overwritten as a side effect.
for num_k in K:
    kmeanModel = KMeans(n_clusters=num_k)
    kmeanModel.fit(features)
    distortions.append(kmeanModel.inertia_)

In [None]:
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

### Evaluation

In [None]:
from sklearn.metrics import silhouette_score

sil = []
kmax = 15

# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
# A dedicated loop variable keeps the notebook-level `k` from being overwritten.
for num_k in range(2, kmax+1):
    kmeans = KMeans(n_clusters = num_k).fit(features)
    labels = kmeans.labels_
    sil.append(silhouette_score(features, labels, metric = 'euclidean'))

In [None]:
plt.figure(figsize=(16,8))
# The scores were computed for 2..kmax clusters, so plot against exactly that
# range (the elbow range K has a different length).
plt.plot(range(2, kmax + 1), sil, 'bx-')
plt.xlabel('k')
plt.ylabel('sil')
plt.title('The shilouette method showing the optimal k')
plt.show()

### Visualisation

In [None]:
from sklearn.manifold import TSNE
fig, ax = plt.subplots(figsize = (10,8))
kmeans_clustering = KMeans( n_clusters = 6 )
idx = kmeans_clustering.fit_predict( features )

#use t-sne
# TSNE with its default settings needs a dense array, so the sparse TF-IDF
# matrix is densified for the embedding only.
X = TSNE(n_components=2, perplexity=10).fit_transform( features.toarray() )

#plot graph: one single-letter matplotlib colour per cluster label
colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
plt.scatter(X[:,0], X[:,1], c=colors[kmeans_clustering.labels_])
plt.title('K-Means (t-SNE)')
plt.show()

In [None]:
#Importing required modules
 
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

In [None]:
#Reduce the TF-IDF features to two principal components
pca = PCA(2)

#Transform the data. The sparse TF-IDF matrix is densified first because
#older scikit-learn releases reject sparse input to PCA.
df = pca.fit_transform(features.toarray())

df.shape

In [None]:
import matplotlib.pyplot as plt

#filter rows of the projected data that belong to cluster 0 of the fitted
#K-Means `model` (the name `label` is not defined at this point)
filtered_label0 = df[model.labels_ == 0]

#plotting the results
plt.scatter(filtered_label0[:,0] , filtered_label0[:,1])
plt.show()

In [None]:
#filter rows of the projected data by the labels of the fitted K-Means `model`
#(the name `label` is not defined at this point)
filtered_label2 = df[model.labels_ == 2]

# NOTE(review): `model` is fitted with 6 clusters, so label 8 presumably selects
# no rows and the black series stays empty — confirm which cluster was intended.
filtered_label8 = df[model.labels_ == 8]

#Plotting the results
plt.scatter(filtered_label2[:,0] , filtered_label2[:,1] , color = 'red')
plt.scatter(filtered_label8[:,0] , filtered_label8[:,1] , color = 'black')
plt.show()

In [None]:
#Load Data
# NOTE(review): order_centroids comes from argsort(), i.e. it holds term indices,
# one row per cluster of `model` — clustering these rows again looks unintended; confirm.
data = order_centroids
pca = PCA(2)
 
#Transform the data
#df = pca.fit_transform(data)
 
#Import KMeans module
from sklearn.cluster import KMeans
 
#Initialize the class object
kmeans = KMeans(n_clusters= 6)
 
#predict the labels of clusters.
label = kmeans.fit_predict(data)
 
#Getting unique labels
u_labels = np.unique(label)
 
#plotting the results:
# Only the first two columns of `data` are drawn.
for i in u_labels:
    plt.scatter(data[label == i , 0] , data[label == i , 1] , label = i)
plt.legend()
plt.show()

In [None]:
model.labels_

In [None]:
from sklearn.cluster import KMeans
import numpy as np

In [None]:
#6 Visualising the clusters
# `documents` is a 1-D array of strings and cannot be plotted, so the TF-IDF
# features are projected to 2-D with a PCA fitted in this cell; the centroids of
# the same K-Means `model` are projected with the same PCA.
cluster_pca = PCA(n_components=2)
points_2d = cluster_pca.fit_transform(features.toarray())
centroids_2d = cluster_pca.transform(model.cluster_centers_)

cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta', 'orange']
for cluster_id in np.unique(model.labels_):
    in_cluster = model.labels_ == cluster_id
    plt.scatter(points_2d[in_cluster, 0], points_2d[in_cluster, 1], s=100,
                c=cluster_colors[cluster_id % len(cluster_colors)],
                label='Cluster %d' % (cluster_id + 1))

#Plot the centroids of the fitted model in the same 2-D projection.
plt.scatter(centroids_2d[:, 0], centroids_2d[:, 1], s=300, c='yellow', label = 'Centroids')
plt.title('K-Means clusters (PCA projection)')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.legend()
plt.show()

In [None]:
#Load Data
# NOTE(review): `terms` holds the vectorizer's feature names (strings), which
# K-Means presumably cannot fit — this cell looks like a leftover; confirm before running.
data = terms
pca = PCA(2)
 
#Transform the data
#df = pca.fit_transform(data)
 
#Import KMeans module
from sklearn.cluster import KMeans
 
#Initialize the class object
kmeans = KMeans(n_clusters= 6)
 
#predict the labels of clusters.
label = kmeans.fit_predict(data)
 
#Getting unique labels
u_labels = np.unique(label)
 
#plotting the results:
for i in u_labels:
    plt.scatter(data[label == i , 0] , data[label == i , 1] , label = i)
plt.legend()
plt.show()

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Silhouette analysis of the fitted K-Means `model`.
# The cluster count, labels and both axes are defined here so the cell does not
# depend on names that were never created in this notebook.
n_clusters = model.n_clusters
cluster_labels = model.labels_

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(features, cluster_labels)
print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg,)

# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(features, cluster_labels)

y_lower = 10
for i in range(n_clusters):
    # Aggregate the silhouette scores for samples belonging to
    # cluster i, and sort them
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,)

    # Label the silhouette plots with their cluster numbers at the middle
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])  # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# 2nd Plot showing the actual clusters formed, projected to 2-D with a PCA
# fitted in this cell; the centroids are projected with the same PCA.
silhouette_pca = PCA(n_components=2)
points_2d = silhouette_pca.fit_transform(features.toarray())
centers = silhouette_pca.transform(model.cluster_centers_)

colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(points_2d[:, 0], points_2d[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k" )

# Labeling the clusters
# Draw white circles at cluster centers
ax2.scatter(centers[:, 0],centers[:, 1], marker="o",c="white",alpha=1, s=200, edgecolor="k",)

for i, c in enumerate(centers):
    ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")

plt.suptitle("Silhouette analysis for KMeans clustering on sample data with n_clusters = %d" % n_clusters,fontsize=14,fontweight="bold",)

plt.show()

### Evaluation clustering

In [None]:
from sklearn.metrics.cluster import homogeneity_score
from sklearn.metrics.cluster import completeness_score
from sklearn.metrics.cluster import v_measure_score

Homogeneity

In [None]:
# NOTE(review): homogeneity_score compares two label arrays (reference labels and
# predicted cluster labels); only the feature matrix is passed here. Which column
# should serve as the reference labelling is not visible from this cell — confirm.
homogeneity_score(features)

Completeness

V-measure

__________