# IFT6390 Project - Remi

##Useful piece of code

In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

## Setup - data

In [2]:
import pandas as pd
import numpy as np

### Load CSV

In [None]:
s140 = pd.read_csv('data/sentiment140.csv')
cc = pd.read_csv('data/climatechange.csv')
mr = pd.read_csv('data/moviereview.csv')

### Create pickle

In [12]:
s140.to_pickle('data/s140.pkl')
cc.to_pickle('data/cc.pkl')
mr.to_pickle('data/mr.pkl')

### Load Pickle

In [11]:
s140 = pd.read_pickle('data/s140.pkl')
cc = pd.read_pickle('data/cc.pkl')
mr = pd.read_pickle('data/mr.pkl')

## Sentiment140 Data Exploration

In [38]:
print(f"Total of {s140.shape[0]} text samples")
print(s140.groupby('target').count())
print('='*80)
print(s140.head())
print('='*80)
print(s140['target'].value_counts(normalize=True))
print('='*80)
s140_pos = s140[(s140.target == 'positive')]
s140_neg = s140[(s140.target == 'negative')]
s140_neut=s140.loc[s140['target']=='neutral']
print(f"**Only {s140_neut.shape[0]} 'neutral' text samples**")

Total of 1600498 text sample
                                                text    target
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  negative
1  Got a headache :/ MC stop making music, you ca...  negative
2  lol still worked like crazy lol  . lol Your la...  negative
3  why won't netflix send me S. Darko? I know it'...  negative
4  [ToZ] Clan Website offline  http://www.theoutl...  negative
positive    0.499958
negative    0.499955
neutral     0.000087
Name: target, dtype: float64
**Only 139 'neutral' sample comment**


## Transform

In [3]:
#Natural Language Toolkit
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize #creates arrays of words
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

#identifies words which are not adding semantic value to the sentence
stopw = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/rd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
import re
from re import sub
import string
from sklearn.feature_extraction.text import CountVectorizer


class clean:
    def url(df:pd.DataFrame) -> pd.DataFrame:
        # SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
        return df.str.replace('http\S+|www.\S+', '_link_', case=False)
    
    def url(s:str) -> str:
        return sub('http\S+|www.\S+', '_link_', s, flags=re.IGNORECASE)
    
    def rm_repeat(w:str) -> str:
        """removes letters repeated 3+ times"""
        return sub(r'(.)\1{2,}', r'\1', w)
    
    #create an array of the words contained in each comment _
    #while removing strings representing numbers and stopwords
    
    def merge(v:np.ndarray) -> str:
        return "".join(w+" " for w in v) #converts vector into string
    
    def convert(s:str) -> str:
        """Returns a vector representation of the sentence"""
        s=clean.url(s)
        s=clean.rm_repeat(s)
        #v=[w for w in word_tokenize(s) if (w not in stopw and len(w) > 1)] # create words
        v=word_tokenize(s)
        return "".join(clean.to_ascii(w)+" " for w in v) #converts vector into string
    
    def to_ascii(w:str) -> str:
        """Keeps ascii-only characters and appends tokens representing 
        strings of digits and non-ascii characters"""
        onlyascii= "".join(i for i in w.lower() if (ord(i) < 48 or (ord(i)> 57 and ord(i)<128)))        
        return onlyascii + clean.notascii(w) + clean.onlynumber(w)
    
    def onlynumber(s:str) -> str:
        """Returns a '_number_' token to represent any string of digits"""
        n="".join(i for i in s if (ord(i) >= 48 and ord(i)<= 57 ))
        if (n !=""):
            return " _number_"
        else:
            return ""

    def notascii(s:str) -> str:
        """Map strings of non-ascii characters to '_notascii_' token"""
        symbol= "".join(i for i in s if ord(i) >= 128)
        if (symbol !=""):
            return " _notascii_"
        else:
            return ""
    
    
    #SRC -> https://www.geeksforgeeks.org/implement-isnumber-function-in-python/
    # Implementation of isNumber() function 
    def isNumber(s): 
        """Considers strings of digits headed with sign characters"""

        # handle for signed values
        negative = False
        if(s[0] =='-' or s[0] =='+'): 
            sign = True

        if sign == True: 
            return clean.isNumber(s[1:]) #handles repeated signs recursively
        else:
            return s.isdigit()

        ## try to convert the string to int 
        #try: 
        #    n = int(s) 
        #    return True
        ## catch exception if cannot be converted 
        #except ValueError: 
        #    return False
    

    def normalize_text(s:str) -> str:
        """removes stop words, words of size 1, symbols and numbers"""
        return [w for w in word_tokenize(clean.convert(s)) if (w not in stopw and len(w) > 1)]

    def lemmatize(s:str) -> str:
        wnl = WordNetLemmatizer()
        return [wnl.lemmatize(w) for w in clean.normalize_text(s)]

    def stem(s:str) -> str:
        ps  = PorterStemmer()
        return [ps.stem(w) for w in clean.normalize_text(s)]
    

In [None]:
s140['lemma'] = s140['text'].apply(clean.lemmatize)

In [81]:
s140['length']=s140['lemma'].apply(lambda x: len(x))

                                                text    target  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  negative   
1  Got a headache :/ MC stop making music, you ca...  negative   
2  lol still worked like crazy lol  . lol Your la...  negative   
3  why won't netflix send me S. Darko? I know it'...  negative   
4  [ToZ] Clan Website offline  http://www.theoutl...  negative   

                                               lemma  length  
0  [switchfoot, _link_, awww, 's, bummer, shoulda...      11  
1  [got, headache, mc, stop, making, music, ca, n...      11  
2  [lol, still, worked, like, crazy, lol, lol, la...      18  
3  [wo, n't, netflix, send, s., darko, know, 's, ...      16  
4              [toz, clan, website, offline, _link_]       5  


## Produce cleaned dataframe for future usage

In [105]:
s140.to_pickle('data/s140_clean.pkl')

In [5]:
s140=pd.read_pickle('data/s140_clean.pkl')

In [103]:
print(s140['length'].describe().astype('int64') )
print('='*80)
print(s140.head())

count    1600498
mean           8
std            4
min            0
25%            5
50%            8
75%           11
max          118
Name: length, dtype: int64
                                                text    target  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  negative   
1  Got a headache :/ MC stop making music, you ca...  negative   
2  lol still worked like crazy lol  . lol Your la...  negative   
3  why won't netflix send me S. Darko? I know it'...  negative   
4  [ToZ] Clan Website offline  http://www.theoutl...  negative   

                                               lemma  length  
0  [switchfoot, _link_, awww, 's, bummer, shoulda...      11  
1  [got, headache, mc, stop, making, music, ca, n...      11  
2  [lol, still, worked, like, crazy, lol, lol, la...      18  
3  [wo, n't, netflix, send, s., darko, know, 's, ...      16  
4              [toz, clan, website, offline, _link_]       5  


In [6]:
s140['trimmed']=s140['lemma'].apply(clean.merge).apply(clean.rm_repeat)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer 

def make_tfidf(df):
    tf_vect=TfidfVectorizer(use_idf=False,stop_words=stopw)
    tf=tf_vect.fit_transform(df.tolist())
    
    l=tf.shape[0]
    
    w_count=np.array(tf.sum(axis=0,))[0]/l
    #wcm=w_count.max()
    #w_count=w_count/wcm
    wcr=w_count.argsort()[::-1].argsort()

    
    idf_vect=TfidfVectorizer(use_idf=True,stop_words=stopw)
    idf=idf_vect.fit_transform(df.tolist())
    
    w_marker=np.array(idf.sum(axis=0))[0]/l
    #wmm=w_marker.max()
    #w_marker=w_marker/wmm
    wmr=w_marker.argsort()[::-1].argsort()
    
    
    # SRC -> https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/
    feature_names=tf_vect.get_feature_names()
    
    tf_idf = pd.DataFrame(np.array([w_count, w_marker, wcr, wmr]).T, index=feature_names, columns=["tf", "idf", "rank_tf", "rank_idf"])
    tf_idf = tf_idf.sort_values(by=["tf"],ascending=False)

    tf_idf.rank_tf=tf_idf.rank_tf.astype('int64')
    tf_idf.rank_idf=tf_idf.rank_idf.astype('int64')

    return tf_idf


In [8]:
all=make_tfidf(s140['trimmed'])

In [9]:
tf_idf_vect=TfidfVectorizer(use_idf=True,stop_words=stopw)
tf_idf=tf_idf_vect.fit_transform(s140['trimmed'].tolist())

In [108]:
s140_pos = s140[(s140.target == 'positive')]
s140_neg = s140[(s140.target == 'negative')]
s140_neut = s140[s140.target == 'neutral']

In [124]:
pos=make_tfidf(s140_pos['trimmed'])
neg=make_tfidf(s140_neg['trimmed'])
neut=make_tfidf(s140_neut['trimmed'])

In [149]:
a=10
b=50
print(f'All documents top words\n{all.head(a)}')
print('='*b)
print(f'Positive documents top words\n{pos.head(a)}')
print('='*b)
print(f'Negative documents top words\n{neg.head(a)}')
print('='*b)
print(f'Neutral documents top words\n{neut.head(a)}')

All documents top words
                tf       idf  rank_tf  rank_idf
_number_  0.078189  0.029137        0         0
day       0.022910  0.013212        1         1
good      0.020175  0.011472        2         2
_link_    0.017899  0.010193        3         4
get       0.017192  0.009713        4         6
go        0.016301  0.009873        5         5
like      0.016169  0.009095        6         8
work      0.015640  0.010381        7         3
love      0.015129  0.008805        8        10
today     0.014996  0.009305        9         7
Positive documents top words
                tf       idf  rank_tf  rank_idf
_number_  0.078936  0.029125        0         0
good      0.027984  0.014872        1         1
_link_    0.024132  0.012931        2         3
day       0.024000  0.013735        3         2
love      0.022817  0.012247        4         4
thanks    0.017382  0.009025        5         6
like      0.015616  0.008702        6         9
lol       0.015239  0.008468       

## Differentiating words

In [223]:
def find_keywords(ar1,ar2,ar3,t=1000):
    """Compare ar1 to ar2 and ar3"""

    ar1_w=np.copy(ar1.index[:]).tolist()
    
    a=np.minimum(len(ar1),t)
    b=np.minimum(len(ar2),t)
    c=np.minimum(len(ar3),t)
    i=0
    
    while i < a:
        if (ar1.index[i] in ar2.index[:b]) or (ar1.index[i] in ar3.index[:c]):
            del ar1_w[i]
            a-=1
            #ar1_w.pop(i)
        else:
            i+=1
    
    return ar1_w

In [224]:
treshold=100
key_pos=find_keywords(pos, neg, neut, treshold)
key_neg=find_keywords(neg, pos, neut, treshold)
key_neut=find_keywords(neut, neg, pos, treshold)

In [225]:
a=20
b=50

print(f'Positive documents top words\n{key_pos[:a]}')
print('='*b)
print(f'Negative documents top words\n{key_neg[:a]}')
print('='*b)
print(f'Neutral documents top words\n{key_neut[:a]}')

Positive documents top words
['though', 'little', 'watch', 'ready', 'excited', 'always', 'sound', 'hi', 'school', 'made', 'pretty', 'long', 'lot', 'looking', 'ur', 'ya', 'hour', 'wow', 'cute', 'beautiful']
Negative documents top words
['soon', 'away', 'cold', 'life', 'little', 'left', 'wo', 'headache', 'guy', 'another', 'man', 'trying', 'great', 'omg', 'nothing', 'early', 'someone', 'wait', '_notascii_', 'poor']
Neutral documents top words
['saw', 'business', 'okay', 'com', 'sb', 'boy', 'store', 'food', 'like', 'ceo', 'ap', 'good', 'oh', 'old', 'fitness', 'need', 'battle', 'warner', 'read', 'breakfast']


## Train s140

In [31]:
import sys
!{sys.executable} -m pip install scikit-plot

import scikitplot#.plotters as skplt



## Model testing

In [10]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#The mean score and the 95% confidence interval of the score estimate are hence given by:

def TryModel(model, x, y):    
    if True:
        X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.10, random_state=42)
        
        clf = model.fit(X_train, y_train)
        
        predicted = clf.predict(X_valid)

        from sklearn.metrics import accuracy_score
        acc=accuracy_score(y_valid, predicted)
        print(f"Accuracy: {acc}")
    else:
        scores = cross_val_score(model, x, y, cv=5)
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    scikitplot.metrics.plot_confusion_matrix(y_valid, predicted,x_tick_rotation=90,figsize=(9,9))#, normalize=True)
    
    plt.show()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

x,y = tf_idf, s140['target']

#TryModel(MultinomialNB(), x, y)
#TryModel(linear_model.SGDClassifier(max_iter=1000, tol=1e-3), x, y)
TryModel(MLPClassifier(alpha=0.025, max_iter=25,epsilon=1e-02,verbose=True), x, y)
#TryModel(AdaBoostClassifier(DecisionTreeClassifier(max_depth=10)), x, y)
#vch = VotingClassifier(estimators=[('NB25', clf1), ('SGDp', clf2), ('SGDlog', clf3),('NB35', clf4)], voting='hard')
#vcs = VotingClassifier(estimators=[('NB10', clf1), ('SGD', clf2), ('SGDlog', clf3),('NB35', clf4)], voting='soft')
#TryModel(svm.SVC(kernel='linear', C=1), x, y)


## ClimateChange Analysis

In [18]:
climatechange.shape

(6027, 3)

In [19]:
climatechange.columns

Index(['text', 'confidence', 'target'], dtype='object')

In [20]:
climatechange.head()

Unnamed: 0,text,confidence,target
0,Global warming report urges governments to act...,1.0,Yes
1,Fighting poverty and global warming in Africa ...,1.0,Yes
2,Carbon offsets: How a Vatican forest failed to...,0.8786,Yes
3,Carbon offsets: How a Vatican forest failed to...,1.0,Yes
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,0.8087,Yes


In [83]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = climatechange.iloc[:, 0]

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

climatechange.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [22]:
climatechange['target'].value_counts(normalize=True)

Y      0.605207
N      0.248627
Yes    0.132314
No     0.013852
Name: target, dtype: float64

In [23]:
exist = climatechange[(climatechange.target == 'Y') | (climatechange.target == 'Yes')]
not_exist = climatechange[(climatechange.target == 'N') | (climatechange.target == 'No')]


In [24]:
print(exist.describe())

        confidence
count  3088.000000
mean      0.821351
std       0.178079
min       0.343400
25%       0.662800
50%       0.806050
75%       1.000000
max       1.000000


In [25]:
print(not_exist.describe())

        confidence
count  1099.000000
mean      0.762216
std       0.190782
min       0.345100
25%       0.650100
50%       0.688000
75%       1.000000
max       1.000000


In [26]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('link', 2640), ('climat', 2001), ('chang', 1896), ('global', 1556), ('warm', 1503), ('rt', 522), ('the', 317), ('via', 281), ('new', 176), ('news', 147)]


In [27]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= not_exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('global', 911), ('warm', 904), ('link', 638), ('climat', 367), ('chang', 322), ('rt', 229), ('snow', 155), ('the', 131), ('tcot', 114), ('gore', 99)]


## MovieReview Analysis

In [28]:
moviereview.shape

(50000, 2)

In [29]:
moviereview.head()

Unnamed: 0,text,target
0,Story of a man who has unnatural feelings for ...,negative
1,Airport '77 starts as a brand new luxury 747 p...,negative
2,This film lacked something I couldn't put my f...,negative
3,"Sorry everyone,,, I know this is supposed to b...",negative
4,When I was little my parents took me along to ...,negative


In [74]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = moviereview.iloc[:, 0]

text = text.str.replace('<br />', ' ', case=False)

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

moviereview.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [84]:
moviereview['target'].value_counts(normalize=True)

negative    0.5
positive    0.5
Name: target, dtype: float64

In [77]:
good = moviereview[(moviereview.target == 'positive')]
bad = moviereview[(moviereview.target == 'negative')]


In [78]:
print(good.describe())

                                                     text    target
count                                               25000     25000
unique                                              24881         1
top     Loved today's show!!! It variety solely cookin...  positive
freq                                                    5     25000


In [79]:
print(bad.describe())

                                                     text    target
count                                               25000     25000
unique                                              24696         1
top     When got movie free job, along three similar m...  negative
freq                                                    3     25000


In [80]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('film', 50894), ('the', 49203), ('movi', 44850), ('it', 32237), ('one', 28290), ('like', 20562), ('thi', 18000), ('time', 16630), ('good', 15262), ('see', 15141)]


In [81]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('movi', 58431), ('the', 49707), ('film', 44988), ('it', 31346), ('one', 27163), ('like', 24648), ('thi', 19196), ('make', 16221), ('even', 15440), ('time', 15335)]


## Grid Search

In [58]:
from sklearn.model_selection import GridSearchCV

param = {
    'vect__analyzer': [stemmed_words],
    'tfidf__norm': ['l2'],
    'clf__tol':[1e-1, 1e-2, 1e-3, 1e-4, 1e-5], 
    'clf__C':[100, 50, 20, 10, 1, 1e-1, 1e-2, 1e-3]
    }

svc = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
        ])

clf = GridSearchCV(svc, param, cv=5)

clf.fit(data_small.iloc[:,0],data_small.iloc[:,1])


sorted(clf.cv_results_.keys())






['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_clf__C',
 'param_clf__tol',
 'param_tfidf__norm',
 'param_vect__analyzer',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [60]:
print('Best Params: ', clf.best_params_)

Best Params:  {'clf__C': 0.1, 'clf__tol': 0.01, 'tfidf__norm': 'l2', 'vect__analyzer': <function stemmed_words at 0x150652950>}
