In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
! pip install -q kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list
! kaggle competitions download -c 'name-of-competition'

ref                                                         title                                              size  lastUpdated          downloadCount  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  
gpreda/reddit-vaccine-myths                                 Reddit Vaccine Myths                              229KB  2021-06-01 11:18:46           7012  
crowww/a-large-scale-fish-dataset                           A Large Scale Fish Dataset                          3GB  2021-04-28 17:03:01           4221  
imsparsh/musicnet-dataset                                   MusicNet Dataset                                   22GB  2021-02-18 14:12:19           1407  
dhruvildave/wikibooks-dataset                               Wikibooks Dataset                                   1GB  2021-02-18 10:08:27           2161  
mathurinache/twitter-edge-nodes                             Twitter Edge Nod

In [4]:
!kaggle datasets download -d kazanova/sentiment140

Downloading sentiment140.zip to /content
 91% 74.0M/80.9M [00:00<00:00, 63.7MB/s]
100% 80.9M/80.9M [00:00<00:00, 92.7MB/s]


In [5]:
!unzip sentiment140.zip

Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [6]:
df_columns  = ["sentiment", "ids", "date", "flag", "user", "text"]
df_encoding = "ISO-8859-1"
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding=df_encoding , names=df_columns)
# Removing the unnecessary columns and considering the Sentiment and text columns.
df = df[['sentiment','text']]
# Replacing the values to ease understanding.
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [7]:
import re

url_patterns        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
user_patterns       = '@[^\s]+'
alpha_patterns      = "[^a-zA-Z0-9]"
sequence_patterns   = r"(.)\1\1+"
seqreplace_patterns = r"\1\1"
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',  ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy','@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused', '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

def preprocess(text):
    text = text.lower()

    # Replace all URls with 'URL'
    text = re.sub(url_patterns,' URL',text)
    # Replace all emojis.
    for emoji in emojis.keys():
        text = text.replace(emoji, "EMOJI" + emojis[emoji])        
    # Replace @USERNAME to 'USER'.
    text = re.sub(user_patterns,' USER', text)        
    # Replace all non alphabets.
    text = re.sub(alpha_patterns, " ", text)
    # Replace 3 or more consecutive letters by 2 letter.
    text = re.sub(sequence_patterns, seqreplace_patterns, text)
    return text

In [8]:
from tqdm.notebook import tqdm
tqdm.pandas()
df['pre_text']= df['text'].progress_apply(lambda x: preprocess(x))

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=1600000.0), HTML(value='')))




In [9]:
df.head()

Unnamed: 0,sentiment,text,pre_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",USER URL aww that s a bummer you shoulda ...
1,0,is upset that he can't update his Facebook by ...,is upset that he can t update his facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...,USER i dived many times for the ball managed...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",USER no it s not behaving at all i m mad w...


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(df['pre_text'], df['sentiment'], test_size=0.05, random_state=42, stratify = df['sentiment'])
vectors = TfidfVectorizer(ngram_range=(1,2), max_features=200000)
vectors.fit(X_train)
X_train_transform, X_test_transform = vectors.transform(X_train), vectors.transform(X_test)
vectors.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectors.get_feature_names()))

Vectoriser fitted.
No. of feature_words:  200000


In [11]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
model = MultinomialNB()
model.fit(X_train_transform, y_train)
preds = model.predict(X_test_transform)

print('Accuracy:', accuracy_score(y_test, preds))
print('Confusion Matrix:\n', confusion_matrix(y_test, preds))
print('Classification Report: \n',classification_report(y_test, preds, target_names=['NEGATIVE', 'POSITIVE']))

Accuracy: 0.804725
Confusion Matrix:
 [[32453  7547]
 [ 8075 31925]]
Classification Report: 
               precision    recall  f1-score   support

    NEGATIVE       0.80      0.81      0.81     40000
    POSITIVE       0.81      0.80      0.80     40000

    accuracy                           0.80     80000
   macro avg       0.80      0.80      0.80     80000
weighted avg       0.80      0.80      0.80     80000



In [12]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression(C = 7, max_iter = 1000, n_jobs=-1)
model1.fit(X_train_transform, y_train)
preds = model1.predict(X_test_transform)

print('Accuracy:', accuracy_score(y_test, preds))
print('Confusion Matrix:\n', confusion_matrix(y_test, preds))
print('Classification Report: \n',classification_report(y_test, preds, target_names=['NEGATIVE', 'POSITIVE']))

Accuracy: 0.81895
Confusion Matrix:
 [[32407  7593]
 [ 6891 33109]]
Classification Report: 
               precision    recall  f1-score   support

    NEGATIVE       0.82      0.81      0.82     40000
    POSITIVE       0.81      0.83      0.82     40000

    accuracy                           0.82     80000
   macro avg       0.82      0.82      0.82     80000
weighted avg       0.82      0.82      0.82     80000



In [13]:
di = {0: 'NEGATIVE',1: 'POSITIVE'}
def get_sentiment(model,text):
    text = preprocess(text)
    text = vectors.transform([text])
    pred = model.predict(text)[0]
    print(di[pred])

In [14]:
get_sentiment(model,df.iloc[0]['pre_text']) #naivebayes prediction

NEGATIVE


In [15]:
get_sentiment(model1,df.iloc[0]['pre_text']) #logistic prediction

NEGATIVE
