In [1]:
#import required libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
import preprocessor as p

In [4]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Using cached tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [3]:
#import data
train = pd.read_csv('train_E6oV3lV.csv')
test = pd.read_csv('test_tweets_anuFYb8.csv')

In [4]:
#eda
train

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [5]:
test

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [6]:
#racist/sexist tweets in train
print('The number of tweets that are not racist/sexist : ', sum(train.label == 0))
print('The number of tweets that are racist/sexist : ', sum(train.label == 1))

The number of tweets that are not racist/sexist :  29720
The number of tweets that are racist/sexist :  2242


In [7]:
#null values?
train.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
#data cleaning
#set up special charecters/punctuations we want to be replaced using regular expression
re_without_space = re.compile("(\.)|(\,)|(\{)|(\})|(\()|(\))|(\[)|(\])|(\;)|(\:)|(\!)|(\`)|(\')|(\")|(\%)|(\$)|(\<)|(\>)|(\?)|(\|)|")
re_with_space = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")

In [9]:
#defining a function to clean the dataset using tweet preprocessor and re
def cleantweet(tweet):
        temp = p.clean(tweet)
        temp = re_without_space.sub('', temp.lower())
        temp = re_with_space.sub(' ', temp)
        temp = re.sub('[0-9]', '', temp)
        temp = re.sub('_', '', temp)
        return temp
        

In [10]:
#cleaning the train and test datasets
train['clean'] = train['tweet'].apply(cleantweet)
test['clean'] = test['tweet'].apply(cleantweet)

In [11]:
train

Unnamed: 0,id,label,tweet,clean
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now
...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,ate isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to w...
31959,31960,0,listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",vandalised in in condemns act


In [12]:
label=train.pop('label')


In [13]:
import nltk
#nltk.download('wordnet')

In [14]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [15]:
lemma = WordNetLemmatizer()


def lemmatizing(text):
    text_token = word_tokenize(text)
    stemmed_words = [lemma.lemmatize(word) for word in text_token]
    clean_text = " ".join(stemmed_words)
    clean_text = clean_text.replace('   ', ' ')
    clean_text = clean_text.replace('  ', ' ')
    return clean_text   

In [16]:
train['clean2'] = train['clean'].apply(lemmatizing)
test['clean2'] = test['clean'].apply(lemmatizing)

In [17]:
test

Unnamed: 0,id,tweet,clean,clean2
0,31963,#studiolife #aislife #requires #passion #dedic...,to find,to find
1,31964,@user #white #supremacists want everyone to s...,want everyone to see the new and heres why,want everyone to see the new and here why
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your,safe way to heal your
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",rd to my amazing hilarious eli ahmir uncle dav...,rd to my amazing hilarious eli ahmir uncle dav...
...,...,...,...,...
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation &gt,thought factory left right polarisation & gt
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid,feeling like a mermaid
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today in omg &amp used words like assets&ampli...,today in omg & amp used word like asset & ampl...
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...,happy at work conference right mindset lead to...


In [19]:
label.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [18]:
id=test.pop('id')

In [19]:
id.shape


(17197,)

In [40]:
#def combine():
    
    #combined=train_02.append(test_02)
    #combined.reset_index(inplace=True)
    #return combined
    

In [41]:
#combined=combine()

In [42]:
#combined.shape
#combined.drop('index',axis=1,inplace=True)

In [43]:
#combined

In [44]:
#train['clean'].head()

In [25]:
#separating the train data set into inputs and targets
#x = train['clean']
#y = train['label']
x=train['clean2']
y=label

In [49]:
#combined.shape

In [54]:
#vectorize tweets using countvectorize
#vectorizer = CountVectorizer(binary=True, stop_words = 'english',max_features = 15000)
#creating vectormatrix 
#x_vec = vectorizer.fit_transform(x)
#vec=vectorizer.fit_transform(combined)

In [97]:
#vec


<1x1 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [51]:
#x_vec

<31962x15000 sparse matrix of type '<class 'numpy.int64'>'
	with 147167 stored elements in Compressed Sparse Row format>

In [23]:
#print(vectorizer.get_feature_names())

In [86]:
#train_03=vec[:31962,]
#test_03=vec[31962:,]

In [106]:
#spliting the train data for train and test split cross validation
#X_train, X_test, Y_train, Y_test = train_test_split(vec,label,stratify=label,test_size=0.3,random_state=2,shuffle=True)

In [52]:
#spliting the train data for train and test split cross validation
#x_train, x_test, y_train, y_test = train_test_split(x_vec,y,stratify=y,test_size=0.3,random_state=2,shuffle=True)

In [53]:
#build model
#linear svm model
#svm = svm.SVC(kernel='linear', probability = True)

In [None]:
#model_1= svm.fit(x_vec,y)

In [64]:
#x_vec_test = vectorizer.fit_transform(test_02['clean'])

In [65]:
#x_vec_test

<17197x15000 sparse matrix of type '<class 'numpy.int64'>'
	with 82099 stored elements in Compressed Sparse Row format>

In [130]:
#pred_test=model_1.predict(x_vec_test)

In [134]:
#pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [138]:
#f1 score
#from sklearn.metrics import f1_score

#from sklearn.metrics import confusion_matrix

#print("F1 score :", f1_score(y_test,y_pred ))


F1 score : 0.5691489361702128


In [139]:
#cm = confusion_matrix(y_test, y_pred)
#cm

array([[8782,  134],
       [ 352,  321]])

In [131]:
#prob = svm.fit(x_train, y_train)

In [132]:
#y_pred = svm.predict(x_test)

In [133]:
#accuracy_score(y_test,y_pred)*100

94.93169256439671

In [None]:
#creating output file
#test_pred = svm.predict()


#output
#pred=svm.predict(todo)

#my_submission = pd.DataFrame({'ID':test.id ,'label': pred})
#my_submission.to_csv('submission_1.csv', index=False)

In [144]:

#my_submission = pd.DataFrame({'id':id,'label': pred_test})
#my_submission.to_csv('twitter_2.csv', index=False)

In [None]:
#random forest

In [57]:
#from sklearn.ensemble import RandomForestClassifier

In [58]:

#model_ran= RandomForestClassifier()
#model_ran.fit(x_train, y_train)






RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [59]:
#y_pred_ran = model_ran.predict(x_test)



In [20]:
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix

In [62]:

#print("Training Accuracy :", model_ran.score(x_train, y_train))
#print("Validation Accuracy :", model_ran.score(x_test, y_test))

# calculating the f1 score for the validation set
#print("F1 score :", f1_score(y_test, y_pred_ran))

# confusion matrix
#cm = confusion_matrix(y_test, y_pred_ran)
#print(cm)



Training Accuracy : 0.9922227685156215
Validation Accuracy : 0.9457711961622692
F1 score : 0.5406360424028268
[[8763  153]
 [ 367  306]]


In [66]:
#y_pred_ran_actual=model_ran.predict(x_vec_test)

In [67]:
#y_pred_ran_actual

array([0, 0, 0, ..., 0, 0, 0])

In [68]:
#my_submission = pd.DataFrame({'id':id,'label': y_pred_ran_actual})
#my_submission.to_csv('twitter_3.csv', index=False)

In [2]:
#!pip install xgboost

You should consider upgrading via the '/Users/lakshmipriya/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [6]:
from platform import python_version

print(python_version())

3.7.3


In [21]:

from sklearn.feature_extraction.text import TfidfVectorizer


In [22]:
tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_df=0.8,stop_words = 'english')

In [23]:
tfidf

TfidfVectorizer(max_df=0.8, min_df=5, ngram_range=(1, 3), stop_words='english')

In [26]:
X_train_tfidf = tfidf.fit_transform(x)
#X_test_tfidf = tfidf.transform()

In [27]:
X_train_tfidf

<31962x6037 sparse matrix of type '<class 'numpy.float64'>'
	with 160063 stored elements in Compressed Sparse Row format>

In [28]:
#spliting the train data for train and test split cross validation
x_train1, x_test1, y_train1, y_test1 = train_test_split(X_train_tfidf,y,stratify=y,test_size=0.3,random_state=2,shuffle=True)

In [29]:
from sklearn import svm

svm2 = svm.SVC(kernel='linear', probability = True)
model_2= svm2.fit(x_train1,y_train1)

In [78]:
pred_tfidf=model_2.predict(x_test1)

In [79]:
pred_tfidf

array([0, 1, 0, ..., 0, 0, 0])

In [81]:

from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix


f1_score(y_test1, pred_tfidf)

0.492972972972973

In [82]:
confusion_matrix(y_test1, pred_tfidf)

array([[8892,   24],
       [ 445,  228]])

In [30]:
X_test_tfidf = tfidf.transform(test['clean2'])

In [84]:
pred_tfidf_act=model_2.predict(X_test_tfidf)

In [85]:
pred_tfidf_act

array([0, 0, 0, ..., 0, 0, 0])

In [86]:
my_submission = pd.DataFrame({'id':id,'label': pred_tfidf_act})
my_submission.to_csv('twitter_5.csv', index=False)

In [5]:
import xgboost

In [88]:
pip install gensim

Collecting gensim
  Downloading gensim-4.0.1-cp38-cp38-macosx_10_9_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 331 kB/s eta 0:00:01     |██████████████▋                 | 10.9 MB 502 kB/s eta 0:00:26     |██████████████████████████████▊ | 23.0 MB 194 kB/s eta 0:00:05
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 536 kB/s eta 0:00:01
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.0.1 smart-open-5.1.0
Note: you may need to restart the kernel to use updated packages.


In [89]:
import gensim



In [91]:
tokenized_tweet = train['clean2'].apply(lambda x: x.split()) 

In [93]:
model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            vector_size=200, # desired no. of features/independent variables 
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 2, # no.of cores
            seed = 34)


In [94]:
model_w2v.train(tokenized_tweet, total_examples= len(train['clean2']), epochs=20)

(4665180, 6285280)

In [95]:
model_w2v.wv.most_similar(positive = "life")

[('fullest', 0.48751944303512573),
 ('eliminate', 0.4714736044406891),
 ('selah', 0.4652027487754822),
 ('existence', 0.4591760039329529),
 ('hathaway', 0.45492711663246155),
 ('sober', 0.4410242736339569),
 ('shyan', 0.43827003240585327),
 ('possibility', 0.4355795681476593),
 ('emptiness', 0.43454962968826294),
 ('necessary', 0.431913286447525)]

In [96]:
model_w2v.wv.most_similar(negative = "hate")

[('linstagram', 0.09035412967205048),
 ('recordsmanagervx', 0.05600422993302345),
 ('dragoneducation', 0.0021256180480122566),
 ('rssxactaccounts', -0.018716683611273766),
 ('lozza', -0.019833624362945557),
 ('ilovethesecret', -0.026626083999872208),
 ('stamp', -0.028266310691833496),
 ('sadwav', -0.03171248361468315),
 ('oil', -0.05767695978283882),
 ('camiilabeckeer', -0.05887111648917198)]

In [100]:
from sklearn.tree import DecisionTreeClassifier

In [102]:


model_dt = DecisionTreeClassifier()
model_dt.fit(x_train1, y_train1)



DecisionTreeClassifier()

In [104]:
y_pred_dt = model_dt.predict(x_test1)



In [107]:
print("Training Accuracy :", model_dt.score(x_train1, y_train1))
print("Validation Accuracy :", model_dt.score(x_test1, y_test1))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(y_test1, y_pred_dt))

# confusion matrix
cm = confusion_matrix(y_test1, y_pred_dt)
print(cm)

Training Accuracy : 0.9956197201984535
Validation Accuracy : 0.9417040358744395
f1 score : 0.5552903739061258
[[8681  235]
 [ 324  349]]


In [108]:
y_pred_dt_act=model_dt.predict(X_test_tfidf)

In [109]:
my_submission = pd.DataFrame({'id':id,'label': y_pred_dt_act})
my_submission.to_csv('twitter_6.csv', index=False)

In [111]:
from xgboost import XGBClassifier

In [112]:
model_xgb = XGBClassifier()
model_xgb.fit(x_train1, y_train1)






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [113]:
y_pred_xgb = model_xgb.predict(x_test1)



In [114]:
print("Training Accuracy :", model_xgb.score(x_train1, y_train1))
print("Validation Accuracy :", model_xgb.score(x_test1, y_test1))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(y_test1, y_pred_xgb))

# confusion matrix
cm = confusion_matrix(y_test1, y_pred_xgb)
print(cm)

Training Accuracy : 0.953068430697716
Validation Accuracy : 0.9475440609031182
f1 score : 0.44664466446644663
[[8883   33]
 [ 470  203]]


In [115]:
y_pred_xgb_act=model_xgb.predict(X_test_tfidf)

In [116]:
my_submission = pd.DataFrame({'id':id,'label': y_pred_xgb_act})
my_submission.to_csv('twitter_7.csv', index=False)

In [117]:
from sklearn.linear_model import LogisticRegression

In [118]:
model_lr=LogisticRegression()
model_lr.fit(x_train1,y_train1)

LogisticRegression()

In [119]:
y_pred_lr=model_lr.predict(x_test1)

In [120]:
print("Training Accuracy :", model_lr.score(x_train1, y_train1))
print("Validation Accuracy :", model_lr.score(x_test1, y_test1))


Training Accuracy : 0.9469002815894159
Validation Accuracy : 0.9452497653561373


In [121]:
f1_score(y_test1,y_pred_lr)

0.3772241992882562

In [122]:
confusion_matrix(y_test1, y_pred_lr)


array([[8905,   11],
       [ 514,  159]])

In [123]:
y_pred_lr_act=model_lr.predict(X_test_tfidf)

In [125]:
sub=pd.DataFrame({'id':id,'label':y_pred_lr_act})
sub.to_csv('twitter_8.csv',index=False)

In [126]:
from sklearn.ensemble import RandomForestClassifier


In [127]:
rf=RandomForestClassifier()

model_rf_1 = RandomForestClassifier()
model_rf_1.fit(x_train1, y_train1)



RandomForestClassifier()

In [128]:
y_pred_rf = model_rf_1.predict(x_test1)



In [129]:
print("Training Accuracy :", model_rf_1.score(x_train1, y_train1))
print("Validation Accuracy :", model_rf_1.score(x_test1, y_test1))

# calculating the f1 score for the validation set
print("F1 score :", f1_score(y_test1, y_pred_rf))

# confusion matrix
cm = confusion_matrix(y_test1, y_pred_rf)
print(cm)

Training Accuracy : 0.9955750234657846
Validation Accuracy : 0.9540098028991553
F1 score : 0.5920444033302497
[[8828   88]
 [ 353  320]]


In [130]:
y_pred_rf_act=model_rf_1.predict(X_test_tfidf)

In [131]:
sb=pd.DataFrame({'id':id,'label':y_pred_rf_act})
sb.to_csv('twitter_9.csv',index=False)

In [34]:
#pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.5.0-cp38-cp38-macosx_10_11_x86_64.whl (195.7 MB)
[K     |████████████████████████████████| 195.7 MB 50 kB/s  eta 0:00:01    |██▉                             | 17.6 MB 526 kB/s eta 0:05:39     |████                            | 24.3 MB 549 kB/s eta 0:05:13     |████                            | 24.8 MB 549 kB/s eta 0:05:12     |█████▎                          | 32.1 MB 344 kB/s eta 0:07:56     |█████▉                          | 35.9 MB 322 kB/s eta 0:08:16     |███████▋                        | 46.4 MB 356 kB/s eta 0:06:59     |█████████▎                      | 56.8 MB 414 kB/s eta 0:05:36     |█████████▋                      | 58.5 MB 417 kB/s eta 0:05:29     |█████████▋                      | 58.6 MB 417 kB/s eta 0:05:29     |█████████████████████▏          | 129.5 MB 477 kB/s eta 0:02:19     |████████████████████████████    | 170.8 MB 1.2 MB/s eta 0:00:22
[?25hCollecting opt-einsum~=3.3.0
  Using cached opt_einsum-3.3.0-py3-none-any.

In [31]:
import tensorflow as tf

In [32]:
import keras

In [33]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

In [34]:
tokenizer = Tokenizer()

In [35]:

tokenizer.fit_on_texts(train['clean2'])


In [None]:
train_seq = tokenizer.texts_to_sequences(Xtrain['clean2'])
train_pad = pad_sequences(train_seq, maxlen = max_len)
test_seq = tokenizer.texts_to_sequences(X_test_lem)
test_pad = pad_sequences(test_seq, maxlen = max_len)
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 300