In [1]:
import pandas as pd
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv('./training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', names=column_names)

In [3]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.shape

(1600000, 6)

In [6]:
df = df[['target', 'text']]

In [7]:
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
df.isnull().sum()

target    0
text      0
dtype: int64

In [9]:
df.target.unique()

array([0, 4], dtype=int64)

In [10]:
df.target = df.target.replace(4, 1)

In [11]:
df.target.unique()

array([0, 1], dtype=int64)

In [12]:
df.target.value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [13]:
port_stem = PorterStemmer()

In [13]:
def clean_content(content):
    content = re.sub('[^a-zA-z]', ' ', content)
    content = content.lower()
    content = content.split()
    content = [port_stem.stem(word) for word in content]
    content = ' '.join(content)
    
    return content

In [14]:
df.text[0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [15]:
clean_content(df.text[0])

'switchfoot http twitpic com y zl awww that s a bummer you shoulda got david carr of third day to do it d'

In [16]:
df['clean_content'] = df.text.apply(clean_content)

In [17]:
df.head()

Unnamed: 0,target,text,clean_content
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com y zl awww that s a...
1,0,is upset that he can't update his Facebook by ...,is upset that he can t updat hi facebook by te...
2,0,@Kenichan I dived many times for the ball. Man...,kenichan i dive mani time for the ball manag t...
3,0,my whole body feels itchy and like its on fire,my whole bodi feel itchi and like it on fire
4,0,"@nationwideclass no, it's not behaving at all....",nationwideclass no it s not behav at all i m m...


In [18]:
X = df.clean_content.values
y = df.target.values

In [19]:
print(X)

['switchfoot http twitpic com y zl awww that s a bummer you shoulda got david carr of third day to do it d'
 'is upset that he can t updat hi facebook by text it and might cri as a result school today also blah'
 'kenichan i dive mani time for the ball manag to save the rest go out of bound'
 ... 'are you readi for your mojo makeov ask me for detail'
 'happi th birthday to my boo of alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [20]:
print(y)

[0 0 0 ... 1 1 1]


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=1)

In [22]:
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [23]:
print(X_train)

['rise and shine lol i m up a few min later then plan but no rush i ll be out the door in time'
 'had a nd interview today it s look promis'
 'emilyalbracht i feel your pain' ...
 'bookwitt your welcom if you chang your mind though let me know'
 'howcoza you bet i will bring backup'
 'in window on a linux box instal bsd to a appl ipod yeah']


In [24]:
print(X_test)

['would like to have even a littl bit of time off work lay out until then work'
 'look forward to keep in touch with naomi it ha been a long time sinc we chat'
 'nbatvandr no in germani the nba ha no airtim but i got the intern lp so i will watch the game for sure'
 ...
 'sweet_pea darn i will think of you on our journey so it s like you are realli there'
 'oh my calvin harri call me a daft bastard i feel so cool thi is my new claim to fame'
 'go to a famili meal in hour i ll cheer up my grandpa']


In [25]:
vectorizer = TfidfVectorizer(stop_words='english')

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [35]:
print(X_train)

  (0, 429536)	0.19789519537661035
  (0, 113663)	0.3616149854852959
  (0, 251211)	0.22087097049499688
  (0, 365691)	0.40301099301769566
  (0, 334567)	0.29552399599280776
  (0, 240082)	0.28352924830413395
  (0, 280625)	0.3269786575970019
  (0, 252430)	0.20375410615269232
  (0, 383440)	0.36858295006027136
  (0, 358953)	0.4084884473203552
  (1, 341401)	0.5346690828578055
  (1, 253207)	0.32751242435459643
  (1, 432063)	0.2890717724957497
  (1, 190696)	0.5246708950552977
  (1, 301537)	0.49801862009106257
  (2, 323182)	0.4125612507585606
  (2, 137797)	0.2779132307719751
  (2, 125812)	0.8675006919503955
  (3, 17947)	0.4516026326901516
  (3, 258028)	0.24761566182179068
  (3, 426521)	0.12741880567710315
  (3, 174832)	0.30394304849435605
  (3, 17927)	0.1523903947867367
  (3, 133566)	0.3014016904244015
  (3, 460593)	0.19172852890163558
  :	:
  (1279995, 59373)	0.6306014433092907
  (1279995, 171786)	0.5039716005052342
  (1279995, 48762)	0.38648729503344126
  (1279995, 432331)	0.37285190899144544
  

In [26]:
print(X_test)

  (0, 467337)	0.5472066583483761
  (0, 429536)	0.29221978906086066
  (0, 249743)	0.3851104053792675
  (0, 246960)	0.2764796623304958
  (0, 241689)	0.5114695199057603
  (0, 46947)	0.3589155473562436
  (1, 434842)	0.3689547766926422
  (1, 429536)	0.2096405111277678
  (1, 387646)	0.3046200242634409
  (1, 299393)	0.5580222197891781
  (1, 253207)	0.23444039563801244
  (1, 252985)	0.2760473009086092
  (1, 165307)	0.2365728710055325
  (1, 144378)	0.3174184848432791
  (1, 72675)	0.3575687942581541
  (2, 458166)	0.16295247512227945
  (2, 409671)	0.19831544486019267
  (2, 301246)	0.4806418255470049
  (2, 301229)	0.32419737455875053
  (2, 255881)	0.3654675090676226
  (2, 190565)	0.30561495811941386
  (2, 165307)	0.1673776444978135
  (2, 160074)	0.15053061725722386
  (2, 153860)	0.2858612607763294
  (2, 150461)	0.20795864888157603
  :	:
  (319996, 459061)	0.6505477996122063
  (319996, 433147)	0.4563551206239166
  (319996, 184331)	0.6070647117894821
  (319997, 426737)	0.31497856661116713
  (319997,

In [36]:
model = LogisticRegression(max_iter=1000)

In [37]:
model.fit(X_train, y_train)

In [39]:
print("Train accuracy: ", model.score(X_train, y_train))
print("Test accuracy: ", model.score(X_test, y_test))

Train accuracy:  0.81224453125
Test accuracy:  0.780421875
