In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [4]:
column_names = ['target','id','date','flag','user','text']
df = pd.read_csv("twitter dataset.csv",names=column_names,encoding="ISO-8859-1")
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.shape

(1600000, 6)

In [6]:
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [7]:
##checking the distribution of target columns:-
df['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [8]:
##convert the target 4 to 1
df['target']=df['target'].replace({4:1})

In [9]:
df['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

In [10]:
port_stem = PorterStemmer()

In [11]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    
    return stemmed_content

In [12]:
df['stemmed_content'] = df['text'].apply(stemming)

In [13]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [14]:
X = df['stemmed_content'].values
y = df['target'].values

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=2,stratify=y,random_state=2)

In [18]:
##Converting the textual data to numerical data
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [19]:
print(X_train)

  (0, 71402)	0.6434834452732276
  (0, 30878)	0.6647397514428064
  (0, 144958)	0.3795393504118552
  (1, 514377)	0.323955764318329
  (1, 172873)	0.2876152358216851
  (1, 503320)	0.4586743847404938
  (1, 443975)	0.37877538419659235
  (1, 14402)	0.4665381196801877
  (1, 263365)	0.49073347031060754
  (2, 344970)	0.28756929154859073
  (2, 148320)	0.2726818125079846
  (2, 49668)	0.22240091123396277
  (2, 390998)	0.36789687517816305
  (2, 149290)	0.295438688674672
  (2, 36236)	0.2186768608770192
  (2, 450826)	0.3534630989917906
  (2, 130476)	0.24653213102781832
  (2, 271004)	0.17122607891418495
  (2, 150236)	0.1918437526798713
  (2, 447362)	0.37812810692443355
  (2, 164041)	0.3580742277605795
  (3, 183984)	0.33388162413438544
  (3, 90966)	0.31048853684210265
  (3, 74511)	0.48941493717892465
  (3, 476056)	0.31846481190359344
  :	:
  (1599993, 33422)	0.28256817482221724
  (1599993, 462403)	0.22320071189890286
  (1599994, 398879)	0.7465598882481671
  (1599994, 384028)	0.39154079710219314
  (15999

In [20]:
print(X_test)

  (0, 474893)	0.2222455963286612
  (0, 452118)	0.3379210147373026
  (0, 309172)	0.23192102250118488
  (0, 205312)	0.23988567380585699
  (0, 180311)	0.28861329372313765
  (0, 144958)	0.3129958513819607
  (0, 77838)	0.6580017941831878
  (0, 33852)	0.332946524372629
  (1, 514377)	0.14970637544359747
  (1, 500320)	0.37271205340798025
  (1, 476056)	0.3679092625440984
  (1, 428319)	0.29623149973589263
  (1, 380895)	0.3649996799494441
  (1, 342968)	0.29923116174660547
  (1, 179425)	0.33259329694201395
  (1, 144958)	0.22238056885389892
  (1, 118501)	0.4045922616554632
  (1, 100356)	0.2627413713235587


# Apply LogisticRegression:-

In [21]:
model = LogisticRegression(max_iter = 1000)

In [22]:
model.fit(X_train,y_train)

LogisticRegression(max_iter=1000)

In [23]:
##accuracy score on the training data
y_train_prediction = model.predict(X_train)
acc_train = accuracy_score(y_train,y_train_prediction)
acc_train

0.8095166368957961

In [24]:
##accuracy score on the testing data
y_test_prediction = model.predict(X_test)
acc_test = accuracy_score(y_test,y_test_prediction)
acc_test

1.0

In [29]:
X_new = X_test[1]
print("Original Value:- ",y_test[1])

prediction = model.predict(X_new)
print("Prediction Value:- ",prediction)
if (prediction[0] == 0):
    print("Negative Tweet")
else:
    print("Positive Tweet")

Original Value:-  1
Prediction Value:-  [1]
Positive Tweet
