In [1]:
#!pip install tweet-preprocessor

In [2]:
#!pip install --upgrade pip

In [3]:
#!pip install preprocessor

In [4]:
#!pip install -i https://pypi.anaconda.org/berber/simple tweet-preprocessor

In [5]:
#import required libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [6]:
import preprocessor as p

In [7]:
#import data
train = pd.read_csv('train_tweet.csv')
test = pd.read_csv('test_tweet.csv')

In [8]:
#eda
train

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [9]:
test

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [10]:
#racist/sexist tweets in train
print('The number of tweets that are not racist/sexist : ', sum(train.label == 0))
print('The number of tweets that are racist/sexist : ', sum(train.label == 1))

The number of tweets that are not racist/sexist :  29720
The number of tweets that are racist/sexist :  2242


In [11]:
#null values?
train.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [12]:
#data cleaning
#set up special charecters/punctuations we want to be replaced using regular expression
re_without_space = re.compile("(\.)|(\,)|(\{)|(\})|(\()|(\))|(\[)|(\])|(\;)|(\:)|(\!)|(\`)|(\')|(\")|(\%)|(\$)|(\<)|(\>)|(\?)|(\|)|")
re_with_space = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")

In [13]:
#defining a function to clean the dataset using tweet preprocessor and re
def cleantweet(tweet):
        temp = p.clean(tweet)
        temp = re_without_space.sub('', temp.lower())
        temp = re_with_space.sub(' ', temp)
        return temp
        

In [14]:
#cleaning the train and test datasets
train['clean'] = train['tweet'].apply(cleantweet)
test['clean'] = test['tweet'].apply(cleantweet)

In [15]:
train

Unnamed: 0,id,label,tweet,clean
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now
...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,ate isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to w...
31959,31960,0,listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",vandalised in in condemns act


In [16]:
label=train.pop('label')


In [17]:
label.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [18]:
train_02=train.drop('tweet',axis=1)


In [19]:
test_02 = test.drop('tweet',axis=1)

In [20]:
test_02

Unnamed: 0,id,clean
0,31963,to find
1,31964,want everyone to see the new and heres why
2,31965,safe ways to heal your
3,31966,is the hp and the cursed child book up for res...
4,31967,rd to my amazing hilarious eli ahmir uncle dav...
...,...,...
17192,49155,thought factory left right polarisation &gt3
17193,49156,feeling like a mermaid
17194,49157,today in omg &amp used words like assets&ampli...
17195,49158,happy at work conference right mindset leads t...


In [21]:
train_02=train_02.drop('id',axis=1)

In [22]:
train_02.replace("[^a-zA-Z]"," ",regex=True,inplace=True)

In [23]:
test_02=test_02.drop('id',axis=1)


In [24]:
test_02.replace("[^a-zA-Z]"," ",regex=True,inplace=True)


In [25]:
test_02

Unnamed: 0,clean
0,to find
1,want everyone to see the new and heres why
2,safe ways to heal your
3,is the hp and the cursed child book up for res...
4,rd to my amazing hilarious eli ahmir uncle dav...
...,...
17192,thought factory left right polarisation gt
17193,feeling like a mermaid
17194,today in omg amp used words like assets ampli...
17195,happy at work conference right mindset leads t...


In [26]:
id=test.pop('id')

In [27]:
id.shape


(17197,)

In [28]:
#def combine():
    
    #combined=train_02.append(test_02)
    #combined.reset_index(inplace=True)
    #return combined
    

In [29]:
#combined=combine()

In [30]:
#combined.shape
#combined.drop('index',axis=1,inplace=True)

In [31]:
#combined

In [32]:
#train['clean'].head()

In [33]:
#separating the train data set into inputs and targets
#x = train['clean']
#y = train['label']
x=train_02['clean']
y=label

In [34]:
todo=test_02['clean']

In [35]:
#combined.shape

In [36]:
#vectorize tweets using countvectorize
vectorizer = CountVectorizer(binary=True, stop_words = 'english',max_features = 15000)
#creating vectormatrix 
x_vec = vectorizer.fit_transform(x)
#vec=vectorizer.fit_transform(combined)

In [37]:
#vec


In [38]:
#x_vec

In [39]:
print(vectorizer.get_feature_names())



In [40]:
#train_03=vec[:31962,]
#test_03=vec[31962:,]

In [41]:
#spliting the train data for train and test split cross validation
#X_train, X_test, Y_train, Y_test = train_test_split(vec,label,stratify=label,test_size=0.3,random_state=2,shuffle=True)

In [42]:
#spliting the train data for train and test split cross validation
#x_train, x_test, y_train, y_test = train_test_split(x_vec,y,stratify=y,test_size=0.3,random_state=2,shuffle=True)

In [43]:
#build model
#linear svm model
#svm = svm.SVC(kernel='linear', probability = True)

In [44]:
#model_1= svm.fit(x_vec,y)

In [45]:
#x_vec_test = vectorizer.fit_transform(test_02['clean'])

In [46]:
#x_vec_test

In [47]:
#pred_test=model_1.predict(x_vec_test)

In [48]:
#pred_test

In [82]:
#f1 score
#from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix

#print("F1 score :", f1_score(y_test,y_pred ))


In [50]:
#cm = confusion_matrix(y_test, y_pred)
#cm

In [51]:
#prob = svm.fit(x_train, y_train)

In [52]:
#y_pred = svm.predict(x_test)

In [53]:
#accuracy_score(y_test,y_pred)*100

In [54]:
#creating output file
#test_pred = svm.predict()


#output
#pred=svm.predict(todo)

#my_submission = pd.DataFrame({'ID':test.id ,'label': pred})
#my_submission.to_csv('submission_1.csv', index=False)

In [55]:

#my_submission = pd.DataFrame({'id':id,'label': pred_test})
#my_submission.to_csv('twitter_2.csv', index=False)

In [56]:
#from sklearn.ensemble import RandomForestClassifier

In [57]:

#model_ran= RandomForestClassifier()
#model_ran.fit(x_train, y_train)




In [58]:
#y_pred_ran = model_ran.predict(x_test)



In [79]:
from sklearn.metrics import f1_score

#from sklearn.metrics import confusion_matrix

In [60]:

#print("Training Accuracy :", model_ran.score(x_train, y_train))
#print("Validation Accuracy :", model_ran.score(x_test, y_test))

# calculating the f1 score for the validation set
#print("F1 score :", f1_score(y_test, y_pred_ran))

# confusion matrix
#cm = confusion_matrix(y_test, y_pred_ran)#
#print(cm)



In [61]:
#y_pred_ran_actual=model_ran.predict(x_vec_test)

In [62]:
#y_pred_ran_actual

In [63]:
#my_submission = pd.DataFrame({'id':id,'label': y_pred_ran_actual})
#my_submission.to_csv('twitter_3.csv', index=False)

In [86]:
!pip install xgboost



In [65]:
import xgboost as xgb

In [66]:
from xgboost.sklearn import XGBClassifier

In [67]:
xgboost = XGBClassifier(ngram_range=(1,3), min_df=5, max_df=0.8,stop_words = 'english')

In [75]:
X_train_xgb = xgboost.fit(x_train1,y_train1)



Parameters: { "max_df", "min_df", "ngram_range", "stop_words" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [None]:
from platform import python_version

print(python_version())

In [70]:

from sklearn.feature_extraction.text import TfidfVectorizer


In [71]:
tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_df=0.8,stop_words = 'english')

In [72]:
X_train_tfidf = tfidf.fit_transform(x)
#X_test_tfidf = tfidf.transform()

In [73]:
X_train_tfidf

<31962x6159 sparse matrix of type '<class 'numpy.float64'>'
	with 154927 stored elements in Compressed Sparse Row format>

In [74]:
#spliting the train data for train and test split cross validation
x_train1, x_test1, y_train1, y_test1 = train_test_split(X_train_tfidf,y,stratify=y,test_size=0.3,random_state=2,shuffle=True)

In [None]:
#from sklearn import svm

#svm2 = svm.SVC(kernel='linear', probability = True)
#model_2= svm2.fit(x_train1,y_train1)

In [None]:
#pred_tfidf=model_2.predict(x_test1)

In [None]:
#pred_tfidf

In [None]:
#f1_score(y_test1, pred_tfidf)

In [None]:
#confusion_matrix(y_test1, pred_tfidf)

In [76]:
X_test_tfidf = tfidf.transform(test_02['clean'])

In [None]:
#pred_tfidf_act=model_2.predict(X_test_tfidf)

In [None]:
#pred_tfidf_act

In [None]:
#my_submission = pd.DataFrame({'id':id,'label': pred_tfidf_act})
#my_submission.to_csv('twitter_4.csv', index=False)

In [77]:
# for xbost
pred_xg=X_train_xgb.predict(x_test1)

In [80]:
f1_score(y_test1,pred_xg)

0.4209328782707622

In [83]:
confusion_matrix(y_test1,pred_xg)

array([[8895,   21],
       [ 488,  185]], dtype=int64)

In [84]:
pred_xg_act=X_train_xgb.predict(X_test_tfidf)

In [85]:
my_submission = pd.DataFrame({'id':id,'label':pred_xg_act })
my_submission.to_csv('twitter_5.csv', index=False)