In [99]:
import pandas as pd
from pathlib import Path
import os
import re
from unicodedata import normalize
import string
import pickle as pkl

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [100]:
path = Path('.').parent.absolute()

full_train = os.path.join(path, 'raw-dataset', 'train.csv')
train_df = pd.read_csv(full_train, encoding='utf-8')

In [101]:
tweet_text = train_df['text']
clean_tweet = []
hashtags_list = []
for tweet in tweet_text:
    # Remove links
    tweet = re.sub(r"http\S+", "", tweet)
    # Remove newline
    tweet = tweet.strip('\n')
    # Remove unicode
    tweet = normalize('NFKD', tweet).encode('ascii','ignore')
    # Remove username
    tweet = re.sub('@[^\s]+','',str(tweet))
    clean_tweet.append(tweet)

    # Store and remove hashtags
    temp = re.findall('#[^\s]+', tweet)
    table = str.maketrans("", "", string.punctuation)
    temp = [s.translate(table) for s in temp]
    for hashtag in temp:
        hashtag = re.sub("#", "", hashtag)
        hashtag = hashtag.lower()
        if hashtag not in hashtags_list:
            hashtags_list.append(hashtag)
            
temp = []
for i in range(len(clean_tweet)):
    tweet = clean_tweet[i]
    tweet = re.sub('#[^\s]+', '', tweet)
    tweet = tweet.translate(table).lower()
    tweet = tweet.replace('b', '', 1)
    tweet = tweet.strip()
    temp.append(tweet)
processed_tweets = zip(temp, train_df['target'])
processed_tweets = pd.DataFrame(processed_tweets)
print(processed_tweets)

                                                      0  1
0     our deeds are the reason of this  may allah fo...  1
1                 forest fire near la ronge sask canada  1
2     all residents asked to shelter in place are be...  1
3     13000 people receive  evacuation orders in cal...  1
4     just got sent this photo from ruby  as smoke f...  1
...                                                 ... ..
7608  two giant cranes holding a bridge collapse int...  1
7609  the out of control wild fires in california ev...  1
7610               m194 0104 utc5km s of volcano hawaii  1
7611  police investigating after an ebike collided w...  1
7612  the latest more homes razed by northern califo...  1

[7613 rows x 2 columns]


In [102]:
# Checking number of tweets per target
real, fake, unknown = 0, 0, 0
real_tweet, fake_tweet = {}, {}
for i in range(len(processed_tweets)):
    temp = processed_tweets[1][i]
    if temp == 1:
        real += 1
    elif temp == 0:
        fake += 1
    else:
        unknown += 1
print(real, fake, unknown)

3271 4342 0


In [103]:
count_vect = CountVectorizer(analyzer='word', lowercase=False)
vect_tweets = count_vect.fit_transform(processed_tweets[0])
vect_tweets = vect_tweets.toarray()

In [104]:
X_train, X_test, y_train, y_test  = train_test_split(
        vect_tweets, 
        processed_tweets[1],
        train_size=0.80, 
        random_state=True
)

In [105]:
# Train
log_model = LogisticRegression(solver='lbfgs')
log_model = log_model.fit(X=X_train, y=y_train)

In [106]:
# Evaluate model
y_pred = log_model.predict(X_test)
print(len(y_pred))
print(accuracy_score(y_test, y_pred))

1523
0.7957977675640184


In [132]:
# Predict
full_test = os.path.join(path, 'raw-dataset', 'test.csv')
train_df = pd.read_csv(full_train, encoding='utf-8')
with open('test_tweet.pkl', 'r') as f:
    test_df = f.read()
test_df = [test_df]
test_df = count_vect.transform(test_df).toarray()
new_prediction = log_model.predict(test_df)
print(len(new_prediction))

1
