# IFT6390 Project
### Jordi

## Import data and libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

sentiment140 = pd.read_csv('data/sentiment140.csv')
climatechange = pd.read_csv('data/climatechange.csv')
moviereview = pd.read_csv('data/moviereview.csv')

## Sentiment140 Analysis

In [3]:
sentiment140.shape

(1600498, 2)

In [14]:
sentiment140.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,"Got a headache :/ MC stop making music, you ca...",negative
2,lol still worked like crazy lol . lol Your la...,negative
3,why won't netflix send me S. Darko? I know it'...,negative
4,[ToZ] Clan Website offline http://www.theoutl...,negative


In [5]:
sentiment140['target'].value_counts(normalize=True)

positive    0.499958
negative    0.499955
neutral     0.000087
Name: target, dtype: float64

In [17]:
good = sentiment140[(sentiment140.target == 'positive')]
bad = sentiment140[(sentiment140.target == 'negative')]

In [18]:
print(good.describe())

                 text    target
count          800182    800182
unique         793688         1
top     good morning   positive
freq              118    800182


In [19]:
print(bad.describe())

                             text    target
count                      800177    800177
unique                     790362         1
top     isPlayer Has Died! Sorry   negative
freq                          210    800177


In [20]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('the', 266102), ('to', 252777), ('you', 198324), ('it', 166808), ('and', 149674), ('my', 126021), ('for', 117396), ('is', 109926), ('in', 101206), ('of', 91115)]


In [21]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('to', 313251), ('the', 257969), ('my', 190871), ('it', 182782), ('and', 154018), ('is', 128505), ('in', 115533), ('you', 103929), ('for', 99011), ('me', 93186)]


## ClimateChange Analysis

In [6]:
climatechange.shape

(6027, 3)

In [7]:
climatechange.columns

Index(['text', 'confidence', 'target'], dtype='object')

In [8]:
climatechange.head()

Unnamed: 0,text,confidence,target
0,Global warming report urges governments to act...,1.0,Yes
1,Fighting poverty and global warming in Africa ...,1.0,Yes
2,Carbon offsets: How a Vatican forest failed to...,0.8786,Yes
3,Carbon offsets: How a Vatican forest failed to...,1.0,Yes
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,0.8087,Yes


In [37]:
climatechange['target'].value_counts(normalize=True)

Y      0.605207
N      0.248627
Yes    0.132314
No     0.013852
Name: target, dtype: float64

In [3]:
exist = climatechange[(climatechange.target == 'Y') | (climatechange.target == 'Yes')]
not_exist = climatechange[(climatechange.target == 'N') | (climatechange.target == 'No')]


In [11]:
print(exist.describe())

        confidence
count  3088.000000
mean      0.821351
std       0.178079
min       0.343400
25%       0.662800
50%       0.806050
75%       1.000000
max       1.000000


In [12]:
print(not_exist.describe())

        confidence
count  1099.000000
mean      0.762216
std       0.190782
min       0.345100
25%       0.650100
50%       0.688000
75%       1.000000
max       1.000000


In [5]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('http', 2101), ('climat', 2015), ('chang', 1902), ('global', 1561), ('warm', 1506), ('ly', 1294), ('the', 1221), ('bit', 1184), ('to', 1057), ('of', 836)]


In [4]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= not_exist.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('global', 913), ('warm', 906), ('http', 602), ('the', 520), ('climat', 368), ('ly', 354), ('bit', 325), ('is', 325), ('to', 325), ('chang', 323)]


## MovieReview Analysis

In [14]:
moviereview.shape

(50000, 2)

In [15]:
moviereview.head()

Unnamed: 0,text,target
0,Story of a man who has unnatural feelings for ...,negative
1,Airport '77 starts as a brand new luxury 747 p...,negative
2,This film lacked something I couldn't put my f...,negative
3,"Sorry everyone,,, I know this is supposed to b...",negative
4,When I was little my parents took me along to ...,negative


In [16]:
moviereview['target'].value_counts(normalize=True)

positive    0.5
negative    0.5
Name: target, dtype: float64

In [6]:
good = moviereview[(moviereview.target == 'positive')]
bad = moviereview[(moviereview.target == 'negative')]


In [18]:
print(good.describe())

                                                     text    target
count                                               25000     25000
unique                                              24884         1
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


In [19]:
print(bad.describe())

                                                     text    target
count                                               25000     25000
unique                                              24698         1
top     You do realize that you've been watching the E...  negative
freq                                                    3     25000


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= good.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('the', 341282), ('and', 176642), ('of', 152119), ('to', 131342), ('is', 111830), ('it', 104139), ('in', 99284), ('br', 97954), ('that', 69850), ('thi', 69663)]


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english')

data= bad.iloc[:,0].ravel()
transformed_data =vectorizer.fit_transform(data)
vocab= {a: b for a, b in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))}

print (sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10])

[('the', 326712), ('and', 147816), ('of', 137338), ('to', 136807), ('br', 103997), ('it', 102780), ('is', 99252), ('in', 87577), ('thi', 81367), ('that', 74704)]


## Define sets

In [91]:
data =  pd.concat([sentiment140, moviereview], ignore_index=True)

X_train = data.iloc[:, 0]
y_train = data.iloc[:, 1]

climatechange_transf = climatechange.dropna()

X_test = climatechange_transf.iloc[:, 0]

y_test = climatechange_transf.iloc[:, 2]
y_test = y_test.apply({'N':'negative', 'Y':'positive', 'No': 'negative', 'Yes':'positive'}.get)


In [92]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer 

analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer() 

# FROM - https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))

In [93]:
data_small = data.sample(10000)
print(data_small)

                                                      text    target
1508450  @subray  'subray' so are you 'ray' ? sounds ve...  positive
745323   @SSjUmi oh *hug* yes i like that song. I would...  negative
246855   @RochelleVeturis was my slavedriver at the bab...  negative
550541                                         Am racit...  negative
...                                                    ...       ...
638026             y is it I can't sleep in on my day off?  negative
303321   continuing my diet by drinking this Propel Fit...  negative
350672   @gimmeapuck HAHA I'm still laughing at that. A...  negative
693791                      @amber_boyd Don't count on it   negative
229703   @brittsterbabe27 right on haha. kennywood? per...  negative

[10000 rows x 2 columns]


In [95]:
svc = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(C=0.1)),
        ])

svc.fit(data_small.iloc[:,0],data_small.iloc[:,1])


y_pred = svc.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.7231908287556723
