In [49]:
# installing kaggle library
! pip install kaggle



Upload your kaggle.json file

In [50]:
# configuration the path of kaggle.json file
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


Importing Twitter Sentiment Dataset

In [51]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [52]:
# Extract the downloaded archive into the current working directory
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

# The handle is called `archive` (not `zip`) so the built-in zip() is not shadowed
with ZipFile(dataset,'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


Importing the Dependencies

In [53]:
import re
from functools import lru_cache

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [54]:
import nltk
# Fetch the NLTK stop-word corpus that the stemming step filters tweets against
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
# Show the English stop-word list that will be filtered out of the tweets
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Processing

In [56]:
# Load the data from the CSV file into a pandas dataframe.
# NOTE: no `header`/`names` argument is given, so the first CSV row is consumed as
# the column labels; the file is read again with explicit column names further down.
data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [57]:
# Check the number of rows and columns
data.shape

(1599999, 6)

In [58]:
# Display the first 5 rows of the dataframe
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [59]:
# Re-read the CSV, this time supplying the column labels explicitly
column_names=[
  'target',
  'ids',
  'date',
  'flag',
  'user',
  'text',
]
data=pd.read_csv(
  '/content/training.1600000.processed.noemoticon.csv',
  encoding='ISO-8859-1',
  names=column_names,
)

In [60]:
# Check the number of rows and columns after re-reading with column names
data.shape

(1600000, 6)

In [61]:
# Display the first 5 rows of the dataframe with the named columns
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [62]:
# Count the missing values in each column of the dataset
data.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [63]:
# Check how many rows carry each value of the target column
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


Convert the positive label in the target column from "4" to "1"

In [64]:
# Map the label 4 to 1 in the target column; every other value is left untouched
data['target']=data['target'].replace(4,1)

In [65]:
# Check the distribution of the target column again after relabelling 4 as 1
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


 0 --> Negative Tweet
 1 --> Positive Tweet









***Stemming***

Stemming is the process of reducing a word to its root word.





Example: actor, actress, acting → act

In [66]:
# Single Porter stemmer instance, used by the stemming() function defined below
port=PorterStemmer()

In [67]:
@lru_cache(maxsize=1)
def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, built on first use only."""
  return frozenset(stopwords.words('english'))

def stemming(content):
  """Clean one tweet and return its stemmed words joined by single spaces.

  Steps: replace every non-letter character with a space, lowercase the text,
  split it on whitespace, drop English stop words, stem each remaining word
  with the shared PorterStemmer `port`, and join the stems back together.

  The stop-word list is fetched once and kept as a frozenset. Asking NLTK for
  it again for every single token, and scanning a list for membership, is
  what made the original version very slow over the whole dataset.
  """
  stop_words=_english_stop_words()
  letters_only=re.sub('[^a-zA-Z]',' ',content)
  words=letters_only.lower().split()
  return ' '.join(port.stem(word) for word in words if word not in stop_words)

In [None]:
# Run stemming() on every tweet in the 'text' column and store the result in a new column.
# The column name 'streammed_content' (sic) is referenced by later cells, so it is kept as is.
data['streammed_content']=data['text'].apply(stemming)

In [None]:
# Display the first rows of the dataframe
data.head()

In [None]:
# Show the stemmed tweet text
print(data['streammed_content'])

In [None]:
# Show the sentiment labels
print(data['target'])

In [None]:
# Separate the features (stemmed tweets) from the labels (target column)
X=data['streammed_content'].values
Y=data['target'].values
print(X)
print(Y)

Splitting the data into Training Data and Test Data

In [None]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the class
# proportions the same in both splits; the fixed random_state makes it repeatable.
X_train,X_test,Y_train,Y_test=train_test_split(
  X,
  Y,
  test_size=0.2,
  stratify=Y,
  random_state=2,
)
print("Shape of X : ",X.shape,"\n","Shape of testing data of X : ",X_test.shape,"\n","Shape of training data of X : ",X_train.shape,"\n\n")
print("Shape of Y : ",Y.shape,"\n","Shape of testing data of Y : ",Y_test.shape,"\n","Shape of training data of Y : ",Y_train.shape,"\n\n")
print("Training data of X : ","\n",X_train,"\n\n")

In [None]:
# Convert the textual data to numerical TF-IDF features.
# The vectorizer is fitted on the training tweets only; the test tweets are just transformed.
# NOTE: X_train and X_test are overwritten here, so re-run the split cell before re-running this one.
vectorizer=TfidfVectorizer()
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [None]:
# Inspect the vectorized training data
print(X_train)

In [None]:
# Inspect the vectorized test data
print(X_test)

Training the Machine Learning Model

Logistic Regression

In [None]:
# Logistic regression classifier, allowed up to 1000 solver iterations
model=LogisticRegression(max_iter=1000)
# Fit on the vectorized training tweets and their labels
model.fit(X_train,Y_train)

Model Evaluation

Accuracy Score

In [None]:
# Accuracy score on the training data
X_train_prediction=model.predict(X_train)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second
training_data_accuracy=accuracy_score(Y_train,X_train_prediction)
print("Accuracy score of the training data : ",training_data_accuracy)

In [None]:
# Accuracy score on the testing data
X_test_prediction=model.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second
testing_data_accuracy=accuracy_score(Y_test,X_test_prediction)
print("Accuracy score of the testing data : ",testing_data_accuracy)

Model Accuracy = 77.8 %

Saving the trained model

In [None]:
# Persist the fitted model to disk so it can be reused without retraining
filename='trained_model.sav'
import pickle
# The `with` block closes the file as soon as the dump finishes
with open(filename,'wb') as model_file:
  pickle.dump(model,model_file)

Using the saved model for future predictions

In [None]:
# Loading the saved model
# Unpickling can execute arbitrary code, so only load files written by this notebook.
# The `with` block closes the file handle once the model has been read.
with open(filename,'rb') as model_file:
  loaded_model=pickle.load(model_file)

In [None]:
# Predict one held-out sample with the model restored from disk.
# This section demonstrates the saved model, so `loaded_model` is used instead of `model`.
X_new=X_test[200]
# Print the true label first for comparison with the prediction
print(Y_test[200])
prediction=loaded_model.predict(X_new)
print(prediction)
if(prediction[0]==0):
  print("Negative Tweet")
else:
  print("Positive Tweet")

In [None]:
# Predict another held-out sample with the restored model
X_new=X_test[3]
# True label, printed for comparison with the prediction
print(Y_test[3])
prediction=loaded_model.predict(X_new)
print(prediction)
# Label 0 means a negative tweet; anything else is reported as positive
print("Negative Tweet" if prediction[0]==0 else "Positive Tweet")