In [101]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('charliehebdo_tweets.csv')

In [3]:
data.shape

(38268, 13)

In [4]:
data.head()

Unnamed: 0,tweet_id,tweet_hashtag,tweet_text,retweet,retweet_source_id,retweet_count,is_fake,user_verified,user_followers_count,user_statuses_count,user_friends_count,user_favourites_count,tweet_relative_age
0,552821987006230528,charliehebdo,@BBCWorld #noussommestousdesCharlieHebdo,False,552821287933202435,1,True,False,43,1919,74,1,4
1,553519787943690242,charliehebdo,RT @AP: French terror suspects want to be mart...,False,553518472798683136,0,True,False,1400,4712,1574,302,8
2,552837042624020482,charliehebdo,@beckyt141 anyway he got it bang on again! He'...,False,552835987366486016,0,True,False,124,1725,262,129,10
3,552822076160360448,charliehebdo,@Telegraph That should be basic military skill...,False,552820736490889216,0,True,False,951,26050,1957,4559,9
4,552840282094727168,charliehebdo,@keller_lind @HuffPostUK @CEMB_forum Assassina...,False,552824008379736064,0,True,False,5097,89237,3472,17139,8


In [34]:
X = data.loc[:,['tweet_text','user_verified','user_followers_count','user_statuses_count','user_friends_count','user_favourites_count','tweet_relative_age']]
y = data[['is_fake']]


In [35]:
X.head()

Unnamed: 0,tweet_text,user_verified,user_followers_count,user_statuses_count,user_friends_count,user_favourites_count,tweet_relative_age
0,@BBCWorld #noussommestousdesCharlieHebdo,False,43,1919,74,1,4
1,RT @AP: French terror suspects want to be mart...,False,1400,4712,1574,302,8
2,@beckyt141 anyway he got it bang on again! He'...,False,124,1725,262,129,10
3,@Telegraph That should be basic military skill...,False,951,26050,1957,4559,9
4,@keller_lind @HuffPostUK @CEMB_forum Assassina...,False,5097,89237,3472,17139,8


In [36]:
y.head()


Unnamed: 0,is_fake
0,True
1,True
2,True
3,True
4,True


# 1) Logistic Regression

###  model with all other features except tweet text

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [41]:
#  remove the tweet text coloumn
X_train_new = X_train.iloc[:,1:]
X_test_new =  X_test.iloc[:,1:]

In [39]:
X_train_new.head()

Unnamed: 0,user_verified,user_followers_count,user_statuses_count,user_friends_count,user_favourites_count,tweet_relative_age
5025,False,69,9975,294,1707,10
13639,False,79,3185,131,628,7
14129,False,22,1732,42,431,0
13757,False,773,7109,315,1740,6
21501,False,62,2808,63,396,6


In [40]:
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X_train_new, y_train)

In [42]:
clf.score(X_test_new,y_test)

0.8039433050914562

# BOW representation of tweets

In [57]:
vect = CountVectorizer(stop_words="english")
vectorizetweet = vect.fit_transform(X.loc[:,'tweet_text']).toarray()

In [58]:
vectorizetweet.shape

(38268, 44004)

### model with tweet text and all other features

In [67]:
without_tweets = X.iloc[:,1:].values
X_concat = np.concatenate((vectorizetweet, without_tweets), axis=1)


In [68]:
X_concat.shape

(38268, 44010)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_concat, y, test_size=0.33, random_state=42)

In [70]:
X_train.shape

(25639, 44010)

In [71]:
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [72]:
clf.score(X_test,y_test)

0.8039433050914562

### model with only tweets text

In [103]:
X_train, X_test, y_train, y_test = train_test_split(vectorizetweet, y, test_size=0.33, random_state=42)
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X_train, y_train)

print("Score",clf.score(X_test,y_test))
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

Score 0.9086230105313168


array([[9825,  335],
       [ 819, 1650]])

## New features + vectorized tweets

#### Create new features from tweets text

In [121]:
new_features = pd.DataFrame()

In [122]:
# no of hashtags
new_features['hashtags'] = X['tweet_text'].str.count('#')

In [123]:
#no of tag people
new_features['tagged'] = X['tweet_text'].str.count('@')

In [124]:
# length of tweet
new_features['length'] = X['tweet_text'].apply(lambda x: (len(x)))

In [125]:
new_features['exclaimation'] = X['tweet_text'].str.count('!')

In [126]:
X_with_new_features= np.concatenate((vectorizetweet, new_features.values), axis=1)


In [127]:
X_train, X_test, y_train, y_test = train_test_split(X_with_new_features, y, test_size=0.33, random_state=42)
clf = LogisticRegression().fit(X_train, y_train)
print("Score",clf.score(X_test,y_test))
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

Score 0.9049014173727136


array([[9849,  311],
       [ 890, 1579]])