In [1]:
# installing kaggle library
!pip install kaggle



In [3]:
# configuring the path of kaggle.json file

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Importing Data

In [4]:
# importing twitter sentiment dataset

# api to fetch the dataset from kaggle

!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:00<00:00, 162MB/s]
100% 80.9M/80.9M [00:00<00:00, 131MB/s]


In [5]:
# extracting the compressed dataset

from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [6]:
# importing the dependencies

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# printing the stopwords in english

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# **Data Processing**

In [9]:
# Data processing

# loading the data from csv file to pandas dataframe

twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [10]:
# checking the number of rows and columns

twitter_data.shape

(1599999, 6)

In [11]:
# printing first 10 rows of the data frame

twitter_data.head(10)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing


In [13]:
# naming the columns and reading the dataset again

columns_name = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data.columns = columns_name

twitter_data.head(10)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing


In [14]:
# counting the number of missing values in the dataset

twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [15]:
# checking the distribution of target column

twitter_data.target.value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

In [16]:
# convert the target 4 to 1

twitter_data.replace({'target' : {4 : 1}}, inplace=True)

In [17]:
twitter_data.target.value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

# **Stemming**

In [18]:
# 0 -> negative tweet
# 1 -> possitive tweet

# stemming - process of reducing the word to it's key/ root word

port_stem = PorterStemmer()

In [19]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [20]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [21]:
twitter_data.head(10)

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,kwesidei whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug,need hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...,loltrish hey long time see ye rain bit bit lol...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it,tatiana k nope
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?,twittera que muera
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing,spring break plain citi snow


In [22]:
print(twitter_data['stemmed_content'])

0          upset updat facebook text might cri result sch...
1          kenichan dive mani time ball manag save rest g...
2                            whole bodi feel itchi like fire
3                              nationwideclass behav mad see
4                                        kwesidei whole crew
                                 ...                        
1599994                           woke school best feel ever
1599995    thewdb com cool hear old walt interview http b...
1599996                         readi mojo makeov ask detail
1599997    happi th birthday boo alll time tupac amaru sh...
1599998    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1599999, dtype: object


In [23]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599994    1
1599995    1
1599996    1
1599997    1
1599998    1
Name: target, Length: 1599999, dtype: int64


In [24]:
# seperating the data and label

X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [25]:
print(X)

['upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound'
 'whole bodi feel itchi like fire' ... 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [26]:
print(Y)

[0 0 0 ... 1 1 1]


In [27]:
# splitting the data to training data and test data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [28]:
print(X.shape, X_train.shape, X_test.shape)

(1599999,) (1279999,) (320000,)


In [29]:
# converting textual data to neumerical data

vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [30]:
print(X_train)

  (0, 443035)	0.44770025018080273
  (0, 436655)	0.27247657460069113
  (0, 354566)	0.358082129508793
  (0, 234920)	0.4196620311217523
  (0, 185125)	0.5296934860731303
  (0, 109298)	0.37470892870977707
  (1, 319721)	0.3128279791558143
  (1, 172343)	0.3785321144028949
  (1, 166826)	0.3977368489124923
  (1, 152015)	0.49196931362728563
  (1, 119689)	0.4309123985667578
  (1, 86730)	0.33015139495167095
  (1, 19762)	0.25286013616648123
  (2, 445635)	0.23563700521896036
  (2, 412554)	0.28961919890983556
  (2, 326288)	0.5853283798404599
  (2, 279017)	0.2849974993622606
  (2, 220710)	0.2626149875630723
  (2, 146003)	0.2213048673720909
  (2, 124728)	0.33175938915396974
  (2, 117914)	0.3986115507964782
  (2, 93826)	0.2233049434503089
  (3, 349661)	0.30875649823234047
  (3, 331805)	0.4656145237310222
  (3, 93826)	0.46202690964703447
  :	:
  (1279996, 85239)	0.34250398106721314
  (1279996, 29129)	0.37407281062427566
  (1279996, 13878)	0.4358882391561783
  (1279997, 454559)	0.25440927930600654
  (1279

In [31]:
print(X_test)

  (0, 414033)	0.2624690535388269
  (0, 382367)	0.25411151934588255
  (0, 375613)	0.24658637821166526
  (0, 371332)	0.39726815450904773
  (0, 355868)	0.22363572756759692
  (0, 320624)	0.5238287544743888
  (0, 219133)	0.32954550888877443
  (0, 127038)	0.27309950334655736
  (0, 75261)	0.2655213811047243
  (0, 63768)	0.2642268258978504
  (1, 366259)	0.24522188688615765
  (1, 348128)	0.473720279763542
  (1, 256686)	0.28613491195617663
  (1, 217431)	0.4026080739142293
  (1, 145408)	0.5774478727480552
  (1, 15098)	0.21044041574811614
  (1, 6452)	0.30619872532418485
  (2, 438558)	0.3705789304302396
  (2, 436090)	0.22644701784779067
  (2, 418804)	0.33257344872506
  (2, 407333)	0.27060796399382786
  (2, 375890)	0.3020734075258265
  (2, 258609)	0.3072614985829275
  (2, 124443)	0.2729825984376057
  (2, 107866)	0.36857211925047617
  :	:
  (319995, 307063)	0.3842384782822585
  (319995, 193560)	0.373116119782695
  (319995, 178088)	0.30764903933281135
  (319995, 70151)	0.3246818230890128
  (319995, 21

# Training

In [33]:
# training the ML model

# logistic regression

model = LogisticRegression(max_iter=1000)

In [34]:
model.fit(X_train, Y_train)

# Model Evalution

In [35]:
# Model Evaluation

# Accuracy Score

# accuracy score on the training data

X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [36]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.8102912580400453


In [37]:
# accuracy score on the testing data

X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [38]:
print('Accuracy score of the testing data : ', testing_data_accuracy)

Accuracy score of the testing data :  0.777909375


# Saving the Trained Model

In [39]:
# saving the trained model

import pickle

filename = 'trained_model.sav'
pickle.dump(model, open(filename, 'wb'))

# Loading the Saved model to make predictions

In [40]:
# using the saved model for future predictions

# loading the saved model

loaded_model = pickle.load(open('/content/trained_model.sav', 'rb'))

In [41]:
X_new = X_test[200]

prediction = loaded_model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The tweet is negative')
else:
  print('The tweet is positive')

[1]
The tweet is positive


# Testing Custom Tweets

In [45]:
# Testing the loaded model with my custom tweet

custom_tweet = input('Enter the tweet : ')

# stemming the custom_tweet

stemmed_custom_tweet = stemming(custom_tweet)

# converting the stemmed_custom_tweet to numerical data

input_data = vectorizer.transform([stemmed_custom_tweet])

# making the prediction

prediction = loaded_model.predict(input_data)
print(prediction)

if (prediction[0]==0):
  print('The tweet is negative')
else:
  print('The tweet is positive')

Enter the tweet : you are mad
[0]
The tweet is negative
