In [1]:
import math
import nltk
import scipy
import string
import re
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm
from scipy.stats import randint
from wordcloud import WordCloud
from multiprocessing import Pool
from nltk.corpus import stopwords
from scipy.stats import loguniform

from sklearn.decomposition import PCA
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV,RepeatedStratifiedKFold,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score,roc_auc_score, roc_curve, precision_score, recall_score
# from scikitplot.metrics import plot_roc_curve as auc_roc
import matplotlib.pyplot as plt
from IPython.display import display
plt.rcParams['figure.figsize'] = [20,6]
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK data packages for this notebook; the recorded output shows the
# call is a no-op when a package is already up to date.
# 'wordnet' backs WordNetLemmatizer and 'stopwords' backs stopwords.words('english');
# 'punkt' is not referenced by any cell visible in this part of the notebook.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
import warnings 
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the raw tweet dataset; row 0 of the CSV supplies the column names.
TWEETS_CSV = 'Tweets.csv'
df = pd.read_csv(TWEETS_CSV, header=0)

In [3]:
# Peek at the first rows of the freshly loaded frame to check the columns.
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
# Per-column summary (count / unique / top / freq). In the recorded output the
# count for `text` and `selected_text` is one lower than for the other columns,
# i.e. one row has a missing value there.
df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,61fc1bd58a,my tvs not working i wanna watch vhits :`(,good,neutral
freq,1,1,199,11118


In [5]:
# Keep only the tweet text and its sentiment label.
# The frame is rebound instead of mutated with inplace=True, and missing columns
# are ignored, so re-running this cell after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['selected_text', 'textID'], errors='ignore')

In [6]:
# Confirm that only `text` and `sentiment` remain after the drop.
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [7]:
# (rows, columns) of the trimmed frame.
df.shape

(27481, 2)

In [8]:
# Column dtypes and non-null counts; the recorded output shows `text` has one
# fewer non-null entry than `sentiment`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       27480 non-null  object
 1   sentiment  27481 non-null  object
dtypes: object(2)
memory usage: 429.5+ KB


In [9]:
# Remove rows containing any missing value, modifying `df` itself.
# The index is not reset here, so labels of dropped rows leave gaps.
df.dropna(inplace=True)

# Independent snapshot of the cleaned frame (DataFrame.copy is deep by default).
second_df = df.copy()
second_df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


In [10]:
# List rows that exactly repeat an earlier row (duplicated() flags every
# occurrence after the first). The recorded output is empty.
df[df.duplicated()]

Unnamed: 0,text,sentiment


In [11]:
# NOTE(review): `counter`, `r` and `c` are not read by any cell visible here;
# they are kept in case a later cell depends on them.
counter = 0
r, c = second_df.shape

# Drop exact duplicate rows and renumber the index 0..n-1 in a single chain.
df_nodup = df.drop_duplicates().reset_index(drop=True)

In [12]:
# Preview the de-duplicated, re-indexed frame.
df_nodup.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [13]:
# Work on a copy so that df_nodup keeps the original, uncleaned tweet text.
df_filter = df_nodup.copy()

# Built once when the cell runs instead of once per word. The original looked up
# stopwords.words('english') (a list, scanned linearly) and constructed a new
# WordNetLemmatizer for every single token of every tweet.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocessor(text):
    """Normalize one raw tweet into a space-separated string of content words.

    Steps, in order:
      1. Replace every character outside ``a-zA-Z`` with a space. Digits,
         punctuation and non-ASCII characters therefore act as token
         separators, and URLs are not removed specially; their alphabetic
         pieces (e.g. ``http``) survive as ordinary tokens.
      2. Lower-case the text and split it on whitespace.
      3. Lemmatize each token as a verb (``pos='v'``).
      4. Drop tokens that appear in NLTK's English stop-word list.

    After step 1 and lower-casing, only ``a-z`` and spaces remain, so the
    former per-character whitelist and ``isalnum`` filters could never remove
    anything and have been dropped. The returned string is unchanged.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string input (e.g. NaN) raises ``TypeError`` from
        ``re.sub``, as before.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` if nothing is left.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    lemmas = [_LEMMATIZER.lemmatize(word, pos='v') for word in letters_only.split()]
    return ' '.join(lemma for lemma in lemmas if lemma not in _STOP_WORDS)

In [14]:
# Clean every tweet with `preprocessor` and store the result in df_filter,
# reading from df_nodup so the source column itself is left untouched.
cleaned_text = df_nodup['text'].map(preprocessor)
df_filter['text'] = cleaned_text

df_filter.head()

Unnamed: 0,text,sentiment
0,respond go,neutral
1,sooo sad miss san diego,negative
2,boss bully,negative
3,interview leave alone,negative
4,sons put release already buy,negative


In [17]:
target = 'sentiment'  # name of the label column
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Parameters
    ----------
    text : str
        Text whose whitespace-separated words are stemmed.

    Returns
    -------
    list of str
        One stem per token, in the original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [19]:
# TF-IDF features over the cleaned tweets. Tokenization is delegated to
# tokenizer_porter, and lowercase=False leaves the text's casing as-is.
df_vector = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    tokenizer=tokenizer_porter,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

# Labels as a NumPy array, and the fitted sparse document-term matrix.
label = df_filter[target].values
features = df_vector.fit_transform(df_filter['text'])

In [21]:
# Show the label array followed by the sparse TF-IDF matrix, one per line.
print(label, features, sep='\n')

['neutral' 'negative' 'negative' ... 'positive' 'positive' 'neutral']
  (0, 6637)	0.3776400355368082
  (0, 13899)	0.9259524845043391
  (1, 4349)	0.5774904410682079
  (1, 14380)	0.5374236061146065
  (1, 10732)	0.28866086704873706
  (1, 14309)	0.33255522986570935
  (1, 15440)	0.4286753794997508
  (2, 2290)	0.7669109047171836
  (2, 1982)	0.6417535852847812
  (3, 468)	0.5878637688905023
  (3, 9454)	0.45003602763553296
  (3, 8255)	0.6722230009869373
  (4, 2379)	0.3991796814525973
  (4, 478)	0.389645210134384
  (4, 13790)	0.522200391827341
  (4, 13339)	0.41179156328860056
  (4, 15421)	0.4965547817858938
  (5, 4870)	0.2954023249782883
  (5, 6036)	0.31519803969485477
  (5, 13549)	0.34631673286147474
  (5, 1576)	0.20131738013490516
  (5, 12868)	0.31519803969485477
  (5, 14747)	0.3866654464821108
  (5, 15259)	0.3866654464821108
  (5, 3307)	0.1629500188088339
  :	:
  (27476, 280)	0.34339318025545407
  (27476, 9448)	0.22763852799456433
  (27476, 6000)	0.2728516815929536
  (27476, 18708)	0.20526189