# TEXT MINING PROJECT

### Morales Emanuele

In [1]:
# import libraries
import pandas as pd
from pandas import DataFrame
from nltk.corpus import stopwords
import string
import nltk

import numpy as np
from nltk import *
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.decomposition import LatentDirichletAllocation

from ftfy import fix_encoding
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.summarization.summarizer import summarize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emamo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emamo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emamo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## PART A: Text data pre-processing on Facebook comments

In [2]:
#Import dataframe
df = pd.read_csv("fb_sentiment.csv") 

In [3]:
# Select just the column of interest (comments)

first_column = df.columns[0]
second_column = df.columns[2]

df = df.drop([first_column], axis=1)
df = df.drop([second_column], axis=1)

In [4]:
df

Unnamed: 0,FBPost
0,Drug Runners and a U.S. Senator have somethin...
1,"Heres a single, to add, to Kindle. Just read t..."
2,If you tire of Non-Fiction.. Check out http://...
3,Ghost of Round Island is supposedly nonfiction.
4,Why is Barnes and Nobles version of the Kindle...
...,...
995,I liked it. Its youth oriented and I think th...
996,"I think the point of the commercial is that, e..."
997,Kindle 3 is such a great product. I could not ...
998,develop a way to share books! that is a big d...


## Task 1: 
### 1- Clean the corpus by eliminating punctuation and stop words

In [5]:
#remove punctuation by using pandas and regex (regular expression):

df["FBPost_nopunct"] = df['FBPost'].str.replace('[^\w\s]',' ')

In [6]:
#Create a list of english stopwords:

stop_words_en = stopwords.words("english") 
stop_words_en

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
# Add a column to dataframe without stopwords and punctuations and uppercase letters
nrows = df.shape[0]
stopwords_cleaned = []

for x in range(nrows):
    out1 = [w for w in nltk.word_tokenize(df['FBPost_nopunct'][x].lower()) if w not in stop_words_en]
    out2 = ' '.join(out1).strip()
    stopwords_cleaned.append(out2)
    
df["FB_nostopwords"] = stopwords_cleaned

In [8]:
df

Unnamed: 0,FBPost,FBPost_nopunct,FB_nostopwords
0,Drug Runners and a U.S. Senator have somethin...,Drug Runners and a U S Senator have somethin...,drug runners u senator something murder http w...
1,"Heres a single, to add, to Kindle. Just read t...",Heres a single to add to Kindle Just read t...,heres single add kindle read 19th century stor...
2,If you tire of Non-Fiction.. Check out http://...,If you tire of Non Fiction Check out http ...,tire non fiction check http www amazon com ref...
3,Ghost of Round Island is supposedly nonfiction.,Ghost of Round Island is supposedly nonfiction,ghost round island supposedly nonfiction
4,Why is Barnes and Nobles version of the Kindle...,Why is Barnes and Nobles version of the Kindle...,barnes nobles version kindle much expensive ki...
...,...,...,...
995,I liked it. Its youth oriented and I think th...,I liked it Its youth oriented and I think th...,liked youth oriented think widen appeal
996,"I think the point of the commercial is that, e...",I think the point of the commercial is that e...,think point commercial even borders closing ma...
997,Kindle 3 is such a great product. I could not ...,Kindle 3 is such a great product I could not ...,kindle 3 great product could happier mine
998,develop a way to share books! that is a big d...,develop a way to share books that is a big d...,develop way share books big drawback love kind...


### 2- Tokenize it.

In [9]:
#Number of rows of dataframe
nrows = df.shape[0]

In [10]:
#Create a vector of list containig the tokenized comments and insert a column in dataset containing the comments tokenized.

tok = []

for x in range(nrows):
    comment = nltk.word_tokenize(df["FB_nostopwords"][x])
    tok.append(comment) 
    
df["tokenised"] = tok

In [11]:
df

Unnamed: 0,FBPost,FBPost_nopunct,FB_nostopwords,tokenised
0,Drug Runners and a U.S. Senator have somethin...,Drug Runners and a U S Senator have somethin...,drug runners u senator something murder http w...,"[drug, runners, u, senator, something, murder,..."
1,"Heres a single, to add, to Kindle. Just read t...",Heres a single to add to Kindle Just read t...,heres single add kindle read 19th century stor...,"[heres, single, add, kindle, read, 19th, centu..."
2,If you tire of Non-Fiction.. Check out http://...,If you tire of Non Fiction Check out http ...,tire non fiction check http www amazon com ref...,"[tire, non, fiction, check, http, www, amazon,..."
3,Ghost of Round Island is supposedly nonfiction.,Ghost of Round Island is supposedly nonfiction,ghost round island supposedly nonfiction,"[ghost, round, island, supposedly, nonfiction]"
4,Why is Barnes and Nobles version of the Kindle...,Why is Barnes and Nobles version of the Kindle...,barnes nobles version kindle much expensive ki...,"[barnes, nobles, version, kindle, much, expens..."
...,...,...,...,...
995,I liked it. Its youth oriented and I think th...,I liked it Its youth oriented and I think th...,liked youth oriented think widen appeal,"[liked, youth, oriented, think, widen, appeal]"
996,"I think the point of the commercial is that, e...",I think the point of the commercial is that e...,think point commercial even borders closing ma...,"[think, point, commercial, even, borders, clos..."
997,Kindle 3 is such a great product. I could not ...,Kindle 3 is such a great product I could not ...,kindle 3 great product could happier mine,"[kindle, 3, great, product, could, happier, mine]"
998,develop a way to share books! that is a big d...,develop a way to share books that is a big d...,develop way share books big drawback love kind...,"[develop, way, share, books, big, drawback, lo..."


### 3- Try to obtain bi-grams.

In [12]:
#It creates a sequence of two adjacents words for each of the comments

bigrams_vec=[]

for x in range(nrows):
    bigrams = nltk.ngrams(df["tokenised"][x], n = 2)
    bigrams_vec.append(bigrams)

df["bigrams"] = bigrams_vec

In [13]:
df

Unnamed: 0,FBPost,FBPost_nopunct,FB_nostopwords,tokenised,bigrams
0,Drug Runners and a U.S. Senator have somethin...,Drug Runners and a U S Senator have somethin...,drug runners u senator something murder http w...,"[drug, runners, u, senator, something, murder,...",<generator object ngrams at 0x000001B6E13DD9E0>
1,"Heres a single, to add, to Kindle. Just read t...",Heres a single to add to Kindle Just read t...,heres single add kindle read 19th century stor...,"[heres, single, add, kindle, read, 19th, centu...",<generator object ngrams at 0x000001B6E13DDF20>
2,If you tire of Non-Fiction.. Check out http://...,If you tire of Non Fiction Check out http ...,tire non fiction check http www amazon com ref...,"[tire, non, fiction, check, http, www, amazon,...",<generator object ngrams at 0x000001B6E13DDAC0>
3,Ghost of Round Island is supposedly nonfiction.,Ghost of Round Island is supposedly nonfiction,ghost round island supposedly nonfiction,"[ghost, round, island, supposedly, nonfiction]",<generator object ngrams at 0x000001B6E13DDE40>
4,Why is Barnes and Nobles version of the Kindle...,Why is Barnes and Nobles version of the Kindle...,barnes nobles version kindle much expensive ki...,"[barnes, nobles, version, kindle, much, expens...",<generator object ngrams at 0x000001B6E13DD890>
...,...,...,...,...,...
995,I liked it. Its youth oriented and I think th...,I liked it Its youth oriented and I think th...,liked youth oriented think widen appeal,"[liked, youth, oriented, think, widen, appeal]",<generator object ngrams at 0x000001B6E14E22E0>
996,"I think the point of the commercial is that, e...",I think the point of the commercial is that e...,think point commercial even borders closing ma...,"[think, point, commercial, even, borders, clos...",<generator object ngrams at 0x000001B6E14E2350>
997,Kindle 3 is such a great product. I could not ...,Kindle 3 is such a great product I could not ...,kindle 3 great product could happier mine,"[kindle, 3, great, product, could, happier, mine]",<generator object ngrams at 0x000001B6E14E23C0>
998,develop a way to share books! that is a big d...,develop a way to share books that is a big d...,develop way share books big drawback love kind...,"[develop, way, share, books, big, drawback, lo...",<generator object ngrams at 0x000001B6E14E2430>


In [14]:
#Access to a specific Bigram in the dataset

for bigram in df["bigrams"][2]:
    print(bigram)

('tire', 'non')
('non', 'fiction')
('fiction', 'check')
('check', 'http')
('http', 'www')
('www', 'amazon')
('amazon', 'com')
('com', 'ref')
('ref', 'nb_sb_noss')
('nb_sb_noss', 'url')
('url', 'search')
('search', 'alias')
('alias', '3daps')
('3daps', 'field')
('field', 'keywords')
('keywords', 'danielle')
('danielle', 'lee')
('lee', 'zwissler')
('zwissler', 'x')
('x', '0')
('0', '0')


## Task 2: 
### 1- Split the original corpus in sentences.

In [15]:
#It creates a vector of lists containing the sentences composing each comments 

sentences_vec=[]

for x in range(nrows):
    sent = nltk.sent_tokenize(df["FBPost"][x])
    sentences_vec.append(sent)

In [16]:
df["sentences"] = sentences_vec

In [17]:
df

Unnamed: 0,FBPost,FBPost_nopunct,FB_nostopwords,tokenised,bigrams,sentences
0,Drug Runners and a U.S. Senator have somethin...,Drug Runners and a U S Senator have somethin...,drug runners u senator something murder http w...,"[drug, runners, u, senator, something, murder,...",<generator object ngrams at 0x000001B6E13DD9E0>,"[Drug Runners and a U.S., Senator have someth..."
1,"Heres a single, to add, to Kindle. Just read t...",Heres a single to add to Kindle Just read t...,heres single add kindle read 19th century stor...,"[heres, single, add, kindle, read, 19th, centu...",<generator object ngrams at 0x000001B6E13DDF20>,"[Heres a single, to add, to Kindle., Just read..."
2,If you tire of Non-Fiction.. Check out http://...,If you tire of Non Fiction Check out http ...,tire non fiction check http www amazon com ref...,"[tire, non, fiction, check, http, www, amazon,...",<generator object ngrams at 0x000001B6E13DDAC0>,"[If you tire of Non-Fiction.., Check out http:..."
3,Ghost of Round Island is supposedly nonfiction.,Ghost of Round Island is supposedly nonfiction,ghost round island supposedly nonfiction,"[ghost, round, island, supposedly, nonfiction]",<generator object ngrams at 0x000001B6E13DDE40>,[Ghost of Round Island is supposedly nonfiction.]
4,Why is Barnes and Nobles version of the Kindle...,Why is Barnes and Nobles version of the Kindle...,barnes nobles version kindle much expensive ki...,"[barnes, nobles, version, kindle, much, expens...",<generator object ngrams at 0x000001B6E13DD890>,[Why is Barnes and Nobles version of the Kindl...
...,...,...,...,...,...,...
995,I liked it. Its youth oriented and I think th...,I liked it Its youth oriented and I think th...,liked youth oriented think widen appeal,"[liked, youth, oriented, think, widen, appeal]",<generator object ngrams at 0x000001B6E14E22E0>,"[I liked it., Its youth oriented and I think t..."
996,"I think the point of the commercial is that, e...",I think the point of the commercial is that e...,think point commercial even borders closing ma...,"[think, point, commercial, even, borders, clos...",<generator object ngrams at 0x000001B6E14E2350>,"[I think the point of the commercial is that, ..."
997,Kindle 3 is such a great product. I could not ...,Kindle 3 is such a great product I could not ...,kindle 3 great product could happier mine,"[kindle, 3, great, product, could, happier, mine]",<generator object ngrams at 0x000001B6E14E23C0>,"[Kindle 3 is such a great product., I could no..."
998,develop a way to share books! that is a big d...,develop a way to share books that is a big d...,develop way share books big drawback love kind...,"[develop, way, share, books, big, drawback, lo...",<generator object ngrams at 0x000001B6E14E2430>,"[develop a way to share books!, that is a big ..."


### 2.1 - Bag of words

In [18]:
#The corpus is composed by the comments without punctuation, stopwords and uppercase letters
corpus = df["FB_nostopwords"]

In [19]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['00', '10', '100', '10oz', '11', '12', '1295910892', '1298310486', '14', '18', '19', '1900', '19th', '1st', '20', '200', '2010', '2011', '2012', '20th', '23', '24', '25', '2nd', '30', '321', '3500', '39', '3daps', '3g', '41', '49', '4th', '500', '5th', '5us', '60', '75', '79', '866', '87', '8851', '90', '94', '99', '99cents', 'aa', 'ability', 'able', 'abroad', 'absolute', 'absolutely', 'access', 'accessories', 'accident', 'accidentally', 'accidently', 'accomodate', 'accompanying', 'account', 'ache', 'across', 'acting', 'activities', 'actual', 'actually', 'ad', 'add', 'addicted', 'addicting', 'addiction', 'addictive', 'adjust', 'adjustments', 'adopt', 'adore', 'adoro', 'adquirido', 'adult', 'adults', 'adventure', 'advert', 'advertise', 'advertised', 'advertisement', 'advertising', 'advise', 'aesop', 'afair', 'affair', 'afford', 'afordable', 'afraid', 'africa', 'afterwards', 'ag56twvu5xwc2', 'agenda', 'ages', 'aggravating', 'ago', 'agree', 'agreed', 'airplane', 'al', 'albeit', 'alerts',

In [20]:
#Print the "Bag of words"
X = vectorizer.fit_transform(corpus)
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### 2.2 - TF-IDF

In [21]:
#Print the term frequency - inverse document frequency, that measures the importance of a term 
#with respect to a document or a collection of documents

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)
#print(vectorizer.get_feature_names())

In [22]:
print(tfidf.shape)

(1000, 2767)


In [23]:
print(tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.25208385 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


### 2.3 - Document-term Matrix

In [24]:
print(X.toarray()) #Bag of words

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
print(tfidf.toarray()) #TF-IDF

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.25208385 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


## Task 3: Try to create a pipeline for implementing Task 1, parts 1 and 2:

In [26]:
df = pd.read_csv("fb_sentiment.csv")

comments = df["FBPost"]

In [27]:
len(df)

1000

In [28]:
#Create a function for the elimination of punctuation, stopwords and uppercase letters

def Clean(data, language = "english"):
    
    data = data.str.replace('[^\w\s]',' ')
    
    stop_words_en = stopwords.words(language) 
    
    nrows = len(data)
    stopwords_cleaned = []

    for x in range(nrows):
        out1 = [w for w in nltk.word_tokenize(data[x].lower()) if w not in stop_words_en]
        out2 = ' '.join(out1).strip()
        stopwords_cleaned.append(out2)
        
    return pd.Series(stopwords_cleaned)

In [29]:
#Create a function for the tokenization

def Tokenise(stopwords_cleaned):

    tok = []
    
    nrows = len(stopwords_cleaned)

    for x in range(nrows):
        comment = nltk.word_tokenize(stopwords_cleaned[x])
        tok.append(comment) 
    
    return pd.Series(tok)

In [30]:
#Create the pipeline
comments.pipe(Clean).pipe(Tokenise)

0      [drug, runners, u, senator, something, murder,...
1      [heres, single, add, kindle, read, 19th, centu...
2      [tire, non, fiction, check, http, www, amazon,...
3         [ghost, round, island, supposedly, nonfiction]
4      [barnes, nobles, version, kindle, much, expens...
                             ...                        
995       [liked, youth, oriented, think, widen, appeal]
996    [think, point, commercial, even, borders, clos...
997    [kindle, 3, great, product, could, happier, mine]
998    [develop, way, share, books, big, drawback, lo...
999                                       [love, kindle]
Length: 1000, dtype: object

# PART B  (Classification and clustering, topic model and summarisation)

## Task 1: Perform classification and clustering and provide comments (within your Python code) on your results (commenting your code).

To perform these tasks it is used a labeled dataset containing SMS that are labeled with "1" if it is a spam SMS or "0" viceversa.

In [31]:
#import dataset and fix encoding

df = pd.read_csv("SMSCollection.csv", sep = ";", encoding='utf-8')

for i in range(len(df)-1):
        
    df["Content"][i] = fix_encoding(df["Content"][i])

df = df[["Content", "Spam"]]
df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Content"][i] = fix_encoding(df["Content"][i])


In [32]:
df['Spam'].value_counts()

0    4825
1     747
Name: Spam, dtype: int64

## 1.1. Classification

In the following lines I created a classification function taking as input a corpus and the actual classification of the documents and splits the dataset into a training part and a test part (10% of the dataset), train a logistic regression model on the training set and test it on the test set. 

The function returns the predictions of the model on the test set and the level of accuracy.

In the function it is also added the clean function implemented in the previous task to clean the data (remove punctuation, remove stopwords and get all the words lower case)

Particularly in this case the function is applied on the SMSCollection dataset, a logistic regression model is trained and it is created binary classifier for recognizing Spam/not Spam messages. For the logistic regression it is use the default cutoff value = 0.5. 

As we can see from the output the logistic model is able to recognize almost all the Spam messages with an accuracy of 98%. The accuracy represent the proportion of istances that are correctly classified.

In [33]:
#Create a function that takes as input the text and the classification, split them into training and test set,
#and return a dataframe with the actual and the predicted values and the accuracy of the model:

def clean_log_classification(comments, classification):

    #Clean the text using the function created in the previous task
    comments = Clean(comments)
    
    # Split dataset in training and test set. The length of the test set is the 10% of the length of the entire dataset.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(comments, classification, test_size = 0.1, random_state = 1)
    
    #Creation of the TF-IDF features 
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)

    #Creation of the logistic regression object and training of the model.
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    
    #Testing the model on the test set
    X_test = vectorizer.transform(X_test_raw)
    predictions = classifier.predict(X_test)
    
    #Create a dataframe to visualize the original value of "spam" of test set and the predicted one
    predictions = pd.Series(predictions)
    pred = (pd.DataFrame({'Test':X_test_raw, 'Value':y_test})).reset_index()
    pred["predictions"] = predictions
    
    return pred, accuracy_score(pred["Value"], pred["predictions"]) 

In [34]:
#Application of the function by passing as parameters the column of the message dataset.
clean_log_classification(df["Content"], df["Spam"])

(     index                                               Test  Value  \
 0     1078                               yep pretty sculpture      0   
 1     4028                       yes princess going make moan      0   
 2      958                            welp apparently retired      0   
 3     4642                                             havent      0   
 4     4674  forgot 2 ask ü smth card da present lei ü want...      0   
 ..     ...                                                ...    ...   
 553   3529  1000 winner guaranteed caller prize final atte...      1   
 554   5488                                             k sent      0   
 555   5134                 sday joined training started today      0   
 556      5  freemsg hey darling 3 week word back like fun ...      1   
 557   1289                                   happy new year u      0   
 
      predictions  
 0              0  
 1              0  
 2              0  
 3              0  
 4              0  
 .

## 1.2. Clustering

In the following lines, K-means clustering is applied on the SMSCollection dataset. Since clustering is an unsupervised method, I do not pass to the function the actual classification of the messages, but only their text. 

It is applied the clean function created in the previous task on the text of the messages to clean the data (remove punctuation, remove stopwords and get all the words lower case)

The function applied to the dataset returns 2 unbalanced clusters, the first containg 5103 components and the other containing 469 components.

For each clusters the first 10 components of the centroids are printed: observing the centroids printed in this part, it seems that the centorid of second cluster contains words more related to advertisment and selling activity (like, free, prize, claim, mobile) that could be related to the Spam messages.

The messages with the cluster attributed by k-means clustering is then printed in a data frame format.

In [35]:
5103#import dataset and fix encoding

df = pd.read_csv("SMSCollection.csv", sep = ";", encoding='utf-8')

for i in range(len(df)-1):
        
    df["Content"][i] = fix_encoding(df["Content"][i])

df = df[["Content", "Spam"]]
df = df.dropna()
text = df["Content"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Content"][i] = fix_encoding(df["Content"][i])


In [36]:
#Clean the text using the function created in the previous task and set the number of clusters desired
comments = Clean(text)
num_k = 2

#Create the vectorizer using TfIdf vectorizer to transform the messages

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(comments)
    
#Implement the k-means clustering algorithm
model = KMeans(n_clusters=num_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
 
    
#Retrieve the centroids and the features
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

#for the two classes print the centroids 
for i in range(num_k):
    print("Cluster %d:" %i),
    for ind in order_centroids[i, :10]:
        print(' %s' %terms[ind])

Cluster 0:
 ok
 get
 come
 gt
 lt
 ur
 good
 go
 got
 know
Cluster 1:
 call
 sorry
 later
 free
 prize
 please
 claim
 mobile
 urgent
 contact


In [37]:
#Dataframe of the messages and the correspondent clusters
df = DataFrame(list(zip(model.fit_predict(X), comments)), columns = ["Cluster","Message"])
df

Unnamed: 0,Cluster,Message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,0,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah think goes usf lives around though
...,...,...
5567,0,2nd time tried 2 contact u u 750 pound prize 2...
5568,0,ü b going esplanade fr home
5569,0,pity mood suggestions
5570,0,guy bitching acted like interested buying some...


In [38]:
df['Cluster'].value_counts()

0    5345
1     227
Name: Cluster, dtype: int64

##  Task 2: Perform topic model and provide comments on your results.

In the following lines, topic model is performed on the SMSCollection dataset in order to analyze the text of the messages and determine clusters of similar words (topics).

In this case it is used Latent Dirichlet Allocation (LDA) method with three topics.
Also here the clean function created in the previous task is applied to clean the text.

In [39]:
#import dataset and fix encoding

df = pd.read_csv("SMSCollection.csv", sep = ";", encoding='utf-8')

for i in range(len(df)-1):
        
    df["Content"][i] = fix_encoding(df["Content"][i])

df = df[["Content", "Spam"]]
df = df.dropna()

text = df["Content"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Content"][i] = fix_encoding(df["Content"][i])


In [40]:
#Set a seed
seed = 10
np.random.seed(seed)

In [41]:
#application of the clean function created in the prevous task
text = Clean(text)

In [42]:
cv=CountVectorizer()
cv_features=cv.fit_transform(text)

In [43]:
# Using sklearn.decomposition LDA with 3 topics
TOTAL_TOPICS=3
lda_model=LatentDirichletAllocation(n_components=TOTAL_TOPICS,max_iter=500,max_doc_update_iter=50,learning_method='online',batch_size=1740,learning_offset=50.,random_state=42,n_jobs=16)

In [44]:
document_topics=lda_model.fit_transform(cv_features)

In [45]:
#print the shape of the document
document_topics.shape

(5572, 3)

In [46]:
vocabulary=np.array(cv.get_feature_names())

In [47]:
#Extract 10 terms for each of the 3 topics

topic_terms=lda_model.components_
top_terms=10 # number of 'top terms'
topic_key_terms_idxs=np.argsort(-np.absolute(topic_terms), axis=1)[:,:top_terms]
topic_keyterms=vocabulary[topic_key_terms_idxs]
topics=[', '.join(topic) for topic in topic_keyterms]
pd.set_option('display.max_colwidth',-1)
topics_df=pd.DataFrame(topics,columns=['Term per Topic'], index=['Topic'+str(t) for t in range(1,TOTAL_TOPICS+1)])
topics_df

  pd.set_option('display.max_colwidth',-1)


Unnamed: 0,Term per Topic
Topic1,"get, know, go, good, day, got, ok, come, love, time"
Topic2,"call, free, ur, txt, stop, reply, text, claim, mobile, www"
Topic3,"gt, lt, call, ok, free, mins, mobile, holiday, box, latest"


Looking at the words contained in the topics, it seems that the first one refers to messages sent by  users that are  acquaintances or friends, because there are words like love, time, ok etc. that generally are sent from people that know each other. So these kind of messages could be related to real users and could be classified as not Spam.

The other two topics seem to be more related to commercial topics, we can find for example "www" that represents the link to a website, that is generally sent by company to promote its website, or words like free, latest, stop, mobile, that are more related to the commercial and advertisement activities. These could represent Spam SMS with commericial purposes not appreciated by the user.

## Task 3: Perform summarisation and provide comments (within your Python code) on your results.

The module summarization.summarizer provides a function for summarizing text. 
It is based on text sentences using a variation of the TextRank algorithm. 
The output consists in the most representative sentences of the text. 
The input is a string and a word_count must be provided to the function to determine how many words the output will contain.

In this example it is provided a text about World War 2 that is correctly summarized by the function with a maximum of 100 words.

In [48]:
#Open a text related to the World War 2 
f = open('ww2.txt', 'r')
content = f.read()

In [49]:
#Application of the summarization function from Gensim
print(summarize(content, word_count=100))

From late 1939 to early 1941, in a series of campaigns and treaties, Germany conquered or controlled much of continental Europe, and formed the Axis alliance with Italy and Japan, along with other countries later on.
The war in Europe concluded with the liberation of German-occupied territories, and the invasion of Germany by the Western Allies and the Soviet Union, culminating in the fall of Berlin to Soviet troops, the suicide of Adolf Hitler and the German unconditional surrender on 8 May 1945.
Meanwhile, the victorious Allies of World War I, such as France, Belgium, Italy, Romania, and Greece, gained territory, and new nation-states were created out of the collapse of Austria-Hungary and the Ottoman and Russian Empires.
