# IFT6390 Project
### Jordi

## Import data and libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 
import re

stop_words = set(stopwords.words('english')) 

sentiment140=pd.read_pickle('data/s140_clean_28nov.pkl')
moviereview=pd.read_pickle('data/mr_clean_28nov.pkl')
climatechange=pd.read_pickle('data/cc_clean_28nov.pkl')

## Sentiment140 Analysis

In [2]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = sentiment140.iloc[:, 0]

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

sentiment140.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [3]:
good = sentiment140[(sentiment140.target == 'positive')]
bad = sentiment140[(sentiment140.target == 'negative')]

In [7]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('good', 62840), ('love', 60849), ('go', 59291), ('link', 57941), ('day', 55810), ('thank', 50836), ('it', 50799), ('get', 49201), ('quot', 46959), ('you', 45372)]


In [8]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('go', 79289), ('get', 61510), ('work', 59029), ('day', 50487), ('it', 48231), ('miss', 47571), ('like', 43261), ('want', 40272), ('today', 38363), ('feel', 36919)]


## ClimateChange Analysis

In [4]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = climatechange.iloc[:, 0]

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

climatechange.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [5]:
exist = climatechange[(climatechange.target == 'positive')]
not_exist = climatechange[(climatechange.target == 'negative')]


In [18]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('link', 2640), ('climat', 2001), ('chang', 1896), ('global', 1556), ('warm', 1503), ('rt', 522), ('the', 317), ('via', 281), ('new', 176), ('news', 147)]


In [19]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= not_exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('global', 911), ('warm', 904), ('link', 638), ('climat', 367), ('chang', 322), ('rt', 229), ('snow', 155), ('the', 131), ('tcot', 114), ('gore', 99)]


## MovieReview Analysis

In [6]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = moviereview.iloc[:, 0]

text = text.str.replace('<br />', ' ', case=False)

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

moviereview.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [7]:
good = moviereview[(moviereview.target == 'positive')]
bad = moviereview[(moviereview.target == 'negative')]


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('film', 50894), ('the', 49203), ('movi', 44850), ('it', 32237), ('one', 28290), ('like', 20562), ('thi', 18000), ('time', 16630), ('good', 15262), ('see', 15141)]


In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('movi', 58431), ('the', 49707), ('film', 44988), ('it', 31346), ('one', 27163), ('like', 24648), ('thi', 19196), ('make', 16221), ('even', 15440), ('time', 15335)]


## Define sets

In [42]:
data =  pd.concat([sentiment140, moviereview], ignore_index=True)

X_train = data.iloc[:, 0]
y_train = data.iloc[:, 1]

climatechange_transf = climatechange.dropna()

X_test = climatechange_transf.iloc[:, 0]

y_test = climatechange_transf.iloc[:, 2]


In [43]:
good = data[(data.target == 'positive')]
bad = data[(data.target == 'negative')]

In [44]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('it', 83036), ('good', 78102), ('love', 73315), ('the', 68713), ('go', 67629), ('get', 61894), ('like', 61134), ('day', 60250), ('link', 58305), ('one', 56599)]


In [46]:
from nltk.probability import FreqDist

gram_good = ngrams(get_all_words(list(good.iloc[:, 0])), 3)
freq_words_tri = FreqDist()
for word in list(gram_good):
    freq_words_tri[word] += 1

print(freq_words_tri.most_common(10))

[(('can', 't', 'wait'), 7781), (('i', 'can', 't'), 4632), (('i', 'm', 'going'), 3805), (('i', 'think', 'i'), 2948), (('i', 'm', 'sure'), 2629), (('i', 'm', 'glad'), 1849), (('m', 'gon', 'na'), 1842), (('i', 'm', 'gon'), 1841), (('i', 'know', 'i'), 1625), (('i', 've', 'seen'), 1556)]


In [45]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('go', 89204), ('it', 79577), ('get', 76789), ('like', 67909), ('the', 66568), ('work', 65297), ('movi', 64364), ('one', 56201), ('day', 53894), ('miss', 49554)]


In [49]:
from nltk.probability import FreqDist

gram_bad = ngrams(get_all_words(list(bad.iloc[:, 0])), 3)
freq_words_tri = FreqDist()
for word in list(gram_bad):
    freq_words_tri[word] += 1

print(freq_words_tri.most_common(10))

[(('i', 'can', 't'), 13914), (('i', 'wish', 'i'), 4861), (('i', 'm', 'sorry'), 4691), (('i', 'wan', 'na'), 4654), (('i', 'm', 'going'), 4545), (('i', 'think', 'i'), 4395), (('wan', 'na', 'go'), 3459), (('wish', 'i', 'could'), 3355), (('can', 't', 'believe'), 2906), (('i', 'm', 'gon'), 2710)]


## Small dataset

In [37]:
data_small = data.sample(100000)

In [38]:
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

def get_all_words(corpus):
    list_all_words = []
    
    for i in range(0, len(corpus)):
        sentence = re.sub("[^A-Za-z]+", " ",  corpus[i])
        list_all_words.extend(word_tokenize(sentence.lower()))
    return list_all_words

gram = ngrams(get_all_words(list(data_small.iloc[:, 0])), 3)

In [39]:
from nltk.probability import FreqDist

freq_words_tri = FreqDist()
for word in list(gram):
    freq_words_tri[word] += 1

print(freq_words_tri.most_common(10))

[(('i', 'can', 't'), 1125), (('can', 't', 'wait'), 578), (('i', 'm', 'going'), 516), (('i', 'think', 'i'), 437), (('i', 'wish', 'i'), 361), (('i', 'wan', 'na'), 356), (('i', 'm', 'sure'), 283), (('i', 'm', 'sorry'), 278), (('wish', 'i', 'could'), 261), (('i', 'know', 'i'), 260)]


In [None]:
svc = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(C=0.1, tol=0.1)),
        ])

svc.fit(data_small.iloc[:,0],data_small.iloc[:,1])


y_pred = svc.predict(X_test)

print(accuracy_score(y_test, y_pred))

In [58]:
from sklearn.model_selection import GridSearchCV

param = {
    'vect__analyzer': [stemmed_words],
    'tfidf__norm': ['l2'],
    'clf__tol':[1e-1, 1e-2, 1e-3, 1e-4, 1e-5], 
    'clf__C':[100, 50, 20, 10, 1, 1e-1, 1e-2, 1e-3]
    }

svc = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
        ])

clf = GridSearchCV(svc, param, cv=5)

clf.fit(data_small.iloc[:,0],data_small.iloc[:,1])


sorted(clf.cv_results_.keys())






['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_clf__C',
 'param_clf__tol',
 'param_tfidf__norm',
 'param_vect__analyzer',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [60]:
print('Best Params: ', clf.best_params_)

Best Params:  {'clf__C': 0.1, 'clf__tol': 0.01, 'tfidf__norm': 'l2', 'vect__analyzer': <function stemmed_words at 0x150652950>}
