In [1]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Using cached tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [2]:
#!pip install preprocessor

In [3]:
#!pip install -i https://pypi.anaconda.org/berber/simple tweet-preprocessor

Looking in indexes: https://pypi.anaconda.org/berber/simple


In [4]:
#import required libraries
import pandas as pd
import numpy as np
import re
import preprocessor as p
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [5]:
#import data
train = pd.read_csv('train_tweet.csv')
test = pd.read_csv('test_tweet.csv')

In [6]:
#eda
train

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [7]:
test

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [8]:
#racist/sexist tweets in train
print('The number of tweets that are not racist/sexist : ', sum(train.label == 0))
print('The number of tweets that are racist/sexist : ', sum(train.label == 1))

The number of tweets that are not racist/sexist :  29720
The number of tweets that are racist/sexist :  2242


In [9]:
#null values?
train.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [10]:
#data cleaning
#set up special charecters/punctuations we want to be replaced using regular expression
re_without_space = re.compile("(\.)|(\,)|(\{)|(\})|(\()|(\))|(\[)|(\])|(\;)|(\:)|(\!)|(\`)|(\')|(\")|(\%)|(\$)|(\<)|(\>)|(\?)|(\|)|")
re_with_space = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")

In [11]:
#defining a function to clean the dataset using tweet preprocessor and re
def cleantweet(tweet):
        temp = p.clean(tweet)
        temp = re_without_space.sub('', temp.lower())
        temp = re_with_space.sub(' ', temp)
        temp = re.sub('[0-9]', '', temp)
        temp = re.sub('_', '', temp)
        return temp        

In [12]:
#cleaning the train and test datasets
train['clean'] = train['tweet'].apply(cleantweet)
test['clean'] = test['tweet'].apply(cleantweet)

In [13]:
#separating the train data set into inputs and targets
x_train = train['clean']
y_train = train['label']

In [14]:
x_test=test['clean']

In [15]:
def combine():
    combined = x_train.append(x_test)
    combined = combined.reset_index()
    return combined    

In [16]:
combined = combine().drop(['index'],axis=1)

In [17]:
combined

Unnamed: 0,clean
0,when a father is dysfunctional and is so selfi...
1,thanks for credit i cant use cause they dont o...
2,bihday your majesty
3,i love u take with u all the time in ur
4,factsguide society now
...,...
49154,thought factory left right polarisation &gt
49155,feeling like a mermaid
49156,today in omg &amp used words like assets&ampli...
49157,happy at work conference right mindset leads t...


In [18]:
#vectorize tweets using countvectorize
vectorizer = CountVectorizer(binary=True, stop_words = 'english')
#creating vectormatrix 
countVector = vectorizer.fit_transform(combined['clean'])

In [19]:
print('{} Number of reviews has {} words'.format(countVector.shape[0], countVector.shape[1]))

49159 Number of reviews has 27846 words


In [20]:
print(vectorizer.get_feature_names())



In [21]:
count_vect_df = pd.DataFrame(countVector.toarray(), columns=vectorizer.get_feature_names())
count_vect_df.head()

Unnamed: 0,aa,aaa,aaaaa,aaaaaand,aaaaah,aaaaand,aaaahhh,aaaahhhh,aaaannndd,aaahh,...,zulu,zuma,zumba,zurich,zx,zydeco,zz,zzz,zzzzzzs,zzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
count_vect_df.shape

(49159, 27846)

In [23]:
train_cv = count_vect_df.iloc[:31962,]

In [24]:
test_cv = count_vect_df.iloc[31962:,]
test_cv.shape

(17197, 27846)

In [25]:
#build model
#linear svm model
svm = svm.SVC(kernel='linear', probability = True)

In [None]:
model = svm.fit(train_cv, y_train)

In [None]:
y_pred = svm.predict(x_test)

In [None]:
#creating output file
my_submission = pd.DataFrame({'ID':test.id ,'label': y_pred})
my_submission.to_csv('submission_twitter.csv', index=False)