<a href="https://colab.research.google.com/github/garg0711/Twitter_sentiment/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Move the Kaggle API token (kaggle.json, expected in the current working
# directory) into ~/.kaggle.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI
!kaggle datasets download -d kazanova/sentiment140

Downloading sentiment140.zip to /content
 96% 78.0M/80.9M [00:01<00:00, 81.7MB/s]
100% 80.9M/80.9M [00:01<00:00, 68.8MB/s]


In [5]:
# Extract the data
from zipfile import ZipFile
data_path = '/content/sentiment140.zip'

# Bind the archive to a name that does not shadow the built-in `zip`, which
# would otherwise stay overwritten for the rest of the kernel session.
with ZipFile(data_path, 'r') as archive:
    # With no target path given, members are extracted into the current
    # working directory.
    archive.extractall()


In [7]:
# Importing the libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [8]:
import nltk
# Fetch the NLTK stopwords corpus, which the text preprocessing step reads
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Data Processing

# Loading data
# The CSV has no header row, so header=None keeps its first line as a data
# row instead of consuming it as column names.
data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', header=None, encoding='latin-1')
# Preview only the first rows rather than rendering the whole frame.
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

In [None]:
# Inspect five randomly chosen rows (no seed is set, so the rows differ per run)
data.sample(5)

In [17]:
# Re-load the CSV with explicit column names; this replaces the `data` frame
# read earlier. Passing `names` without `header` makes pandas treat the first
# line of the file as data rather than as a header.
column_name = ['target', 'id', 'date', 'flag', 'user', 'text']
data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names =column_name ,encoding='latin-1')

In [19]:
# Confirm the (rows, columns) of the frame loaded with explicit column names
data.shape

(1600000, 6)

In [21]:
# Checking null values: number of missing entries in each column
data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

Target labels (the raw positive label 4 is remapped to 1 in the next cell):

- 0 : Negative
- 1 : Positive

In [25]:
# Collapse the positive label 4 into 1 so the target is binary (0 / 1).
# The result is assigned back to the column instead of mutating the frame
# with inplace=True.
data['target'] = data['target'].replace({4: 1})

# Checking the distribution of target
# Kept as the cell's last expression so the counts are actually displayed.
data['target'].value_counts()

In [27]:
p_stem = PorterStemmer()

# Build the English stopword collection once. The original rebuilt the list
# and scanned it linearly for every token; a set gives constant-time
# membership tests and the same filtering result.
ENGLISH_STOPWORDS = set(stopwords.words('english'))


def stemming(content):
    """Normalise a piece of text into a space-joined string of stemmed tokens.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop English stopwords and stem the remaining tokens with the
         Porter stemmer.

    Parameters
    ----------
    content : str
        Raw text to preprocess.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stemmed_tokens = [
        p_stem.stem(word) for word in tokens if word not in ENGLISH_STOPWORDS
    ]
    return ' '.join(stemmed_tokens)


In [28]:
# Run the stemming preprocessor over every tweet and store the result in a
# new `stem_content` column, then preview the first rows.
data["stem_content"] = data['text'].apply(stemming)
data.head()

Unnamed: 0,target,id,date,flag,user,text,stem_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [39]:
# Persist the current frame to CSV, omitting the row index
data.to_csv('twitter_data.csv', index = False)

In [29]:
# Assigning input (stemmed text) and output (sentiment label) as NumPy arrays
X = data['stem_content'].values
y = data['target'].values



In [32]:
# Hold out 20% of the samples for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=50,
)

In [33]:
# Converting content to numerical data
# The vectorizer is fitted on the training text only and then reused to
# transform the test text.
vectorizer = TfidfVectorizer()

# NOTE: X_train / X_test are rebound from raw text to the vectorized form, so
# re-run the split cell before re-running this one.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [34]:
# Training the Model

# Logistic regression classifier; max_iter is set to 1000 solver iterations
model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)

In [37]:
# Model Evaluation

# Accuracy on the data the model was fitted on
X_train_pred = model.predict(X_train)
training_data_acc = accuracy_score(y_train, X_train_pred)
print('Train Data Accuracy', training_data_acc)

# Accuracy on the held-out test split
X_test_pred = model.predict(X_test)
test_data_acc = accuracy_score(y_test, X_test_pred)
print('Test Data Accuracy', test_data_acc)



Train Data Accuracy 0.8101765625
Test Data Accuracy 0.778675


In [38]:
import pickle

# Saving Model
name = 'Tweet_model.sav'
# The context manager closes the file handle even if serialization fails
# part-way; the original left the handle from open() unclosed.
with open(name, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): only the classifier is saved here. The fitted `vectorizer` is
# needed to transform new text before calling predict — consider persisting
# it alongside the model.