# IFT6390 Project
### Jordi

## Import data and libraries

In [73]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 
import re

stop_words = set(stopwords.words('english')) 

sentiment140 = pd.read_csv('data/sentiment140.csv')
climatechange = pd.read_csv('data/climatechange.csv')
moviereview = pd.read_csv('data/moviereview.csv')

## Sentiment140 Analysis

###### sentiment140.shape

In [2]:
sentiment140.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,"Got a headache :/ MC stop making music, you ca...",negative
2,lol still worked like crazy lol . lol Your la...,negative
3,why won't netflix send me S. Darko? I know it'...,negative
4,[ToZ] Clan Website offline http://www.theoutl...,negative


In [82]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = sentiment140.iloc[:, 0]

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

sentiment140.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [11]:
sentiment140.head()

Unnamed: 0,text,target
0,"@switchfoot [link] - Awww, that's bummer. You ...",negative
1,"Got headache :/ MC stop making music, can't si...",negative
2,lol still worked like crazy lol . lol Your lak...,negative
3,netflix send S. Darko? I know going terrible I...,negative
4,[ToZ] Clan Website offline [link],negative


In [12]:
sentiment140['target'].value_counts(normalize=True)

positive    0.499958
negative    0.499955
neutral     0.000087
Name: target, dtype: float64

In [13]:
good = sentiment140[(sentiment140.target == 'positive')]
bad = sentiment140[(sentiment140.target == 'negative')]

In [14]:
print(good.describe())

                text    target
count         800182    800182
unique        791421         1
top     good morning  positive
freq             131    800182


In [15]:
print(bad.describe())

                            text    target
count                     800177    800177
unique                    786364         1
top     isPlayer Has Died! Sorry  negative
freq                         210    800177


In [16]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('good', 62840), ('love', 60849), ('go', 59291), ('link', 57941), ('day', 55810), ('thank', 50836), ('it', 50799), ('get', 49201), ('quot', 46959), ('you', 45372)]


In [17]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('go', 79289), ('get', 61510), ('work', 59029), ('day', 50487), ('it', 48231), ('miss', 47571), ('like', 43261), ('want', 40272), ('today', 38363), ('feel', 36919)]


## ClimateChange Analysis

In [18]:
climatechange.shape

(6027, 3)

In [19]:
climatechange.columns

Index(['text', 'confidence', 'target'], dtype='object')

In [20]:
climatechange.head()

Unnamed: 0,text,confidence,target
0,Global warming report urges governments to act...,1.0,Yes
1,Fighting poverty and global warming in Africa ...,1.0,Yes
2,Carbon offsets: How a Vatican forest failed to...,0.8786,Yes
3,Carbon offsets: How a Vatican forest failed to...,1.0,Yes
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,0.8087,Yes


In [83]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = climatechange.iloc[:, 0]

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

climatechange.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [22]:
climatechange['target'].value_counts(normalize=True)

Y      0.605207
N      0.248627
Yes    0.132314
No     0.013852
Name: target, dtype: float64

In [23]:
exist = climatechange[(climatechange.target == 'Y') | (climatechange.target == 'Yes')]
not_exist = climatechange[(climatechange.target == 'N') | (climatechange.target == 'No')]


In [24]:
print(exist.describe())

        confidence
count  3088.000000
mean      0.821351
std       0.178079
min       0.343400
25%       0.662800
50%       0.806050
75%       1.000000
max       1.000000


In [25]:
print(not_exist.describe())

        confidence
count  1099.000000
mean      0.762216
std       0.190782
min       0.345100
25%       0.650100
50%       0.688000
75%       1.000000
max       1.000000


In [26]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('link', 2640), ('climat', 2001), ('chang', 1896), ('global', 1556), ('warm', 1503), ('rt', 522), ('the', 317), ('via', 281), ('new', 176), ('news', 147)]


In [27]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= not_exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('global', 911), ('warm', 904), ('link', 638), ('climat', 367), ('chang', 322), ('rt', 229), ('snow', 155), ('the', 131), ('tcot', 114), ('gore', 99)]


## MovieReview Analysis

In [28]:
moviereview.shape

(50000, 2)

In [29]:
moviereview.head()

Unnamed: 0,text,target
0,Story of a man who has unnatural feelings for ...,negative
1,Airport '77 starts as a brand new luxury 747 p...,negative
2,This film lacked something I couldn't put my f...,negative
3,"Sorry everyone,,, I know this is supposed to b...",negative
4,When I was little my parents took me along to ...,negative


In [74]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = moviereview.iloc[:, 0]

text = text.str.replace('<br />', ' ', case=False)

# SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
text = text.str.replace('http\S+|www.\S+', '[link]', case=False)

moviereview.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [84]:
moviereview['target'].value_counts(normalize=True)

negative    0.5
positive    0.5
Name: target, dtype: float64

In [77]:
good = moviereview[(moviereview.target == 'positive')]
bad = moviereview[(moviereview.target == 'negative')]


In [78]:
print(good.describe())

                                                     text    target
count                                               25000     25000
unique                                              24881         1
top     Loved today's show!!! It variety solely cookin...  positive
freq                                                    5     25000


In [79]:
print(bad.describe())

                                                     text    target
count                                               25000     25000
unique                                              24696         1
top     When got movie free job, along three similar m...  negative
freq                                                    3     25000


In [80]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('film', 50894), ('the', 49203), ('movi', 44850), ('it', 32237), ('one', 28290), ('like', 20562), ('thi', 18000), ('time', 16630), ('good', 15262), ('see', 15141)]


In [81]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('movi', 58431), ('the', 49707), ('film', 44988), ('it', 31346), ('one', 27163), ('like', 24648), ('thi', 19196), ('make', 16221), ('even', 15440), ('time', 15335)]


## Define sets

In [85]:
data =  pd.concat([sentiment140, moviereview], ignore_index=True)

X_train = data.iloc[:, 0]
y_train = data.iloc[:, 1]

climatechange_transf = climatechange.dropna()

X_test = climatechange_transf.iloc[:, 0]

y_test = climatechange_transf.iloc[:, 2]
y_test = y_test.apply({'N':'negative', 'Y':'positive', 'No': 'negative', 'Yes':'positive'}.get)


In [86]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

# FROM - https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

In [87]:
data_small = data.sample(10000)
print(data_small)

                                                      text    target
401262        @martinbrossman Sorry hear Martin Give best!  negative
232101   Just made felt Cornish pasty: [link] And work ...  negative
1492582  @amigastu see there,i buy one online money mon...  positive
1327779  ???????? ??? (???????) gorgeous catchy song! B...  positive
967318                  @jaeho9kim GM fosho... wait, late!  positive
...                                                    ...       ...
244566   Damn....I soo c staying n home, stop n parents...  negative
746773   @RushByTor2112 lol knoww... anyways emily im s...  negative
1104329            @AdrianneCurry Yes. You definitely are.  positive
1261055  @evila_elf lucky you, then. I always seem get ...  positive
118792                     @JadinShropshire somebody sick?  negative

[10000 rows x 2 columns]


In [88]:
svc = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(C=0.1, tol=0.1)),
        ])

svc.fit(data_small.iloc[:,0],data_small.iloc[:,1])


y_pred = svc.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.6945306902316695


In [58]:
from sklearn.model_selection import GridSearchCV

param = {
    'vect__analyzer': [stemmed_words],
    'tfidf__norm': ['l2'],
    'clf__tol':[1e-1, 1e-2, 1e-3, 1e-4, 1e-5], 
    'clf__C':[100, 50, 20, 10, 1, 1e-1, 1e-2, 1e-3]
    }

svc = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
        ])

clf = GridSearchCV(svc, param, cv=5)

clf.fit(data_small.iloc[:,0],data_small.iloc[:,1])


sorted(clf.cv_results_.keys())






['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_clf__C',
 'param_clf__tol',
 'param_tfidf__norm',
 'param_vect__analyzer',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [60]:
print('Best Params: ', clf.best_params_)

Best Params:  {'clf__C': 0.1, 'clf__tol': 0.01, 'tfidf__norm': 'l2', 'vect__analyzer': <function stemmed_words at 0x150652950>}
