# IFT6390 Project
### Jordi

## Import data and libraries

In [6]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

stop_words = set(stopwords.words('english')) 

sentiment140 = pd.read_csv('data/sentiment140.csv')
climatechange = pd.read_csv('data/climatechange.csv')
moviereview = pd.read_csv('data/moviereview.csv')

## Sentiment140 Analysis

In [3]:
sentiment140.shape

(1600498, 2)

In [2]:
sentiment140.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,"Got a headache :/ MC stop making music, you ca...",negative
2,lol still worked like crazy lol . lol Your la...,negative
3,why won't netflix send me S. Darko? I know it'...,negative
4,[ToZ] Clan Website offline http://www.theoutl...,negative


In [23]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = sentiment140.iloc[:, 0]

sentiment140.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [24]:
sentiment140.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,"Got headache :/ MC stop making music, can't si...",negative
2,lol still worked like crazy lol . lol Your lak...,negative
3,netflix send S. Darko? I know going terrible I...,negative
4,[ToZ] Clan Website offline http://www.theoutla...,negative


In [25]:
sentiment140['target'].value_counts(normalize=True)

positive    0.499958
negative    0.499955
neutral     0.000087
Name: target, dtype: float64

In [26]:
good = sentiment140[(sentiment140.target == 'positive')]
bad = sentiment140[(sentiment140.target == 'negative')]

In [27]:
print(good.describe())

                text    target
count         800182    800182
unique        791999         1
top     good morning  positive
freq             131    800182


In [28]:
print(bad.describe())

                            text    target
count                     800177    800177
unique                    786612         1
top     isPlayer Has Died! Sorry  negative
freq                         210    800177


In [29]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('good', 62861), ('love', 60870), ('go', 59310), ('day', 55817), ('thank', 51085), ('it', 50859), ('get', 49210), ('http', 47298), ('quot', 47000), ('you', 45390)]


In [30]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('go', 79300), ('get', 61516), ('work', 59033), ('day', 50489), ('it', 48261), ('miss', 47577), ('like', 43264), ('want', 40275), ('today', 38370), ('feel', 36921)]


## ClimateChange Analysis

In [6]:
climatechange.shape

(6027, 3)

In [7]:
climatechange.columns

Index(['text', 'confidence', 'target'], dtype='object')

In [8]:
climatechange.head()

Unnamed: 0,text,confidence,target
0,Global warming report urges governments to act...,1.0,Yes
1,Fighting poverty and global warming in Africa ...,1.0,Yes
2,Carbon offsets: How a Vatican forest failed to...,0.8786,Yes
3,Carbon offsets: How a Vatican forest failed to...,1.0,Yes
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,0.8087,Yes


In [31]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = climatechange.iloc[:, 0]

climatechange.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [32]:
climatechange['target'].value_counts(normalize=True)

Y      0.605207
N      0.248627
Yes    0.132314
No     0.013852
Name: target, dtype: float64

In [33]:
exist = climatechange[(climatechange.target == 'Y') | (climatechange.target == 'Yes')]
not_exist = climatechange[(climatechange.target == 'N') | (climatechange.target == 'No')]


In [34]:
print(exist.describe())

        confidence
count  3088.000000
mean      0.821351
std       0.178079
min       0.343400
25%       0.662800
50%       0.806050
75%       1.000000
max       1.000000


In [35]:
print(not_exist.describe())

        confidence
count  1099.000000
mean      0.762216
std       0.190782
min       0.345100
25%       0.650100
50%       0.688000
75%       1.000000
max       1.000000


In [36]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('http', 2101), ('climat', 2015), ('chang', 1902), ('global', 1561), ('warm', 1506), ('ly', 1294), ('bit', 1184), ('link', 523), ('rt', 522), ('com', 328)]


In [37]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= not_exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('global', 913), ('warm', 906), ('http', 602), ('climat', 368), ('ly', 354), ('bit', 325), ('chang', 323), ('rt', 229), ('snow', 155), ('the', 132)]


## MovieReview Analysis

In [14]:
moviereview.shape

(50000, 2)

In [15]:
moviereview.head()

Unnamed: 0,text,target
0,Story of a man who has unnatural feelings for ...,negative
1,Airport '77 starts as a brand new luxury 747 p...,negative
2,This film lacked something I couldn't put my f...,negative
3,"Sorry everyone,,, I know this is supposed to b...",negative
4,When I was little my parents took me along to ...,negative


In [38]:
# SRC -> https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
text = moviereview.iloc[:, 0]

moviereview.iloc[:, 0] = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [39]:
moviereview['target'].value_counts(normalize=True)

positive    0.5
negative    0.5
Name: target, dtype: float64

In [40]:
good = moviereview[(moviereview.target == 'positive')]
bad = moviereview[(moviereview.target == 'negative')]


In [41]:
print(good.describe())

                                                     text    target
count                                               25000     25000
unique                                              24883         1
top     Loved today's show!!! It variety solely cookin...  positive
freq                                                    5     25000


In [42]:
print(bad.describe())

                                                     text    target
count                                               25000     25000
unique                                              24698         1
top     Nickelodeon gone toilet. They kids saying thin...  negative
freq                                                    3     25000


In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('br', 97954), ('film', 50897), ('the', 49355), ('movi', 44851), ('it', 32281), ('one', 28290), ('like', 20562), ('thi', 18049), ('time', 16631), ('good', 15262)]


In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('br', 103997), ('movi', 58431), ('the', 49904), ('film', 44990), ('it', 31388), ('one', 27163), ('like', 24648), ('thi', 19255), ('make', 16221), ('even', 15440)]


## Define sets

In [45]:
data =  pd.concat([sentiment140, moviereview], ignore_index=True)

X_train = data.iloc[:, 0]
y_train = data.iloc[:, 1]

climatechange_transf = climatechange.dropna()

X_test = climatechange_transf.iloc[:, 0]

y_test = climatechange_transf.iloc[:, 2]
y_test = y_test.apply({'N':'negative', 'Y':'positive', 'No': 'negative', 'Yes':'positive'}.get)


In [46]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

# FROM - https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

In [47]:
data_small = data.sample(10000)
print(data_small)

                                                      text    target
939957                                       sunny day yet  positive
43064        fanna head bac brockett cant wait mistied yal  negative
1014256  @sunnybrooke1207 Thanks concern, I'm great! la...  positive
1001639                                           showerd!  positive
830805   @Natashaax wait got black dress! know wont car...  positive
...                                                    ...       ...
968789   Yayyy! Got phone back! They kept SIM card &amp...  positive
1115662         @MyCatIsOnFire suppose I'm poor uneducated  positive
832707   @maxchangmin JaeJoong, Please read this!! http...  positive
754637   @Ramiii Rami! u guys schedule Nile FM's new sh...  negative
1332558  Bom dia The Colliding Spiral Galaxies Arp 274 ...  positive

[10000 rows x 2 columns]


In [48]:
svc = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(C=0.1)),
        ])

svc.fit(data_small.iloc[:,0],data_small.iloc[:,1])


y_pred = svc.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.6560783377119656
