## Train a classification model to identify the sentiment of tweets in the given dataset

In [3]:
import pandas as pd

df = pd.read_csv('https://s3.eu-west-1.amazonaws.com/neueda.conygre.com/pydata/ml_fc/tweets_12000.csv',  encoding='ISO-8859-1', header=None)

display(df.head())
df.shape


Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


(12000, 6)

In [5]:
df.columns = ['sentiment', 'use_id', 'date', 'query', 'sender', 'tweet']

In [6]:
df.head()

Unnamed: 0,sentiment,use_id,date,query,sender,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
df['sentiment'].unique()

array([0, 4], dtype=int64)

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [10]:
from nltk.corpus import stopwords

stopwordsSet = set(stopwords.words('English'))

stopwordsSet

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
import re
 
def prepare_review(review):
 
    review = review.lower()
    review = review.split()
 
    review_words = []
 
    for word in review:
 
        if word.startswith('@') or word.startswith('http'):
            continue
 
        word = re.sub('[^a-zA-Z]', ' ', word)
 
        if not word in stopwordsSet:
            review_words.append(word)
 
    review = ' '.join(review_words)
 
    return review

In [12]:
corpus = list(map(prepare_review, df['tweet']))
corpus

['  awww  that s bummer  shoulda got david carr third day it   d',
 'upset can t update facebook texting it    might cry result school today also  blah ',
 'dived many times ball  managed save     rest go bounds',
 'whole body feels itchy like fire',
 'no  it s behaving all  i m mad  here  can t see there ',
 'whole crew',
 'need hug',
 'hey long time see  yes   rains bit  only bit lol   i m fine thanks   how s  ',
 'nope didn t',
 'que muera  ',
 'spring break plain city    it s snowing',
 're pierced ears',
 'couldn t bear watch it  thought ua loss embarrassing          ',
 'counts  idk either  never talk anymore',
 'would ve first  didn t gun  really though  zac snyder s doucheclown ',
 'wish got watch you   miss premiere  ',
 'hollis  death scene hurt severely watch film wry directors cut now ',
 'file taxes',
 'ahh ive always wanted see rent love soundtrack  ',
 'oh dear  drinking forgotten table drinks ',
 'day didn t get much done',
 'one friend called me  asked meet mid valley 

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500, stop_words='english')

X = cv.fit_transform(corpus).toarray()

X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
y = df['sentiment'].values

y

array([0, 0, 0, ..., 4, 4, 4], dtype=int64)

In [18]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [22]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
new_tweet = 'great day had a beautiful cruffin for breakfast'
new_tweet = 'i take no pleasure reporting this, but scientists have found a crazier frog'

prepared_new_tweet = prepare_review(new_tweet)

feature_set = cv.transform( [prepared_new_tweet] ) 

model.predict_proba(feature_set.toarray())

array([[0.42266343, 0.57733657]])