In [1]:
import pandas as pd
from pathlib import Path
import os
import re
from unicodedata import normalize
import string
import pickle as pkl

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
path = Path('.').parent.absolute()

full_train = os.path.join(path, 'raw-dataset', 'train.csv')
train_df = pd.read_csv(full_train, encoding='utf-8')

full_test = os.path.join(path, 'raw-dataset', 'test.csv')
test_df = pd.read_csv(full_test, encoding='utf-8')

In [3]:
def cleaning(tweet_text):
    clean_tweet = []
    hashtags_list = []
    for tweet in tweet_text:
        # Remove links
        tweet = re.sub(r"http\S+", "", tweet)
        # Remove newline
        tweet = tweet.strip('\n')
        # Remove unicode
        tweet = normalize('NFKD', tweet).encode('ascii','ignore')
        # Remove username
        tweet = re.sub('@[^\s]+','',str(tweet))
        clean_tweet.append(tweet)

    # Store and remove hashtags
    temp = re.findall('#[^\s]+', tweet)
    table = str.maketrans("", "", string.punctuation)
    temp = [s.translate(table) for s in temp]
    for hashtag in temp:
        hashtag = re.sub("#", "", hashtag)
        hashtag = hashtag.lower()
        if hashtag not in hashtags_list:
            hashtags_list.append(hashtag)
    
    return clean_tweet, hashtags_list

def more_cleaning(clean_tweet, df):
    temp = []
    table = str.maketrans("", "", string.punctuation)
    for i in range(len(clean_tweet)):
        tweet = clean_tweet[i]
        tweet = re.sub('#[^\s]+', '', tweet)
        tweet = tweet.translate(table).lower()
        tweet = tweet.replace('b', '', 1)
        tweet = tweet.strip()
        temp.append(tweet)
    try:
        processed_tweets = zip(temp, df['target'])
        processed_tweets = pd.DataFrame(processed_tweets)
    except KeyError:
        processed_tweets = temp
        processed_tweets = pd.DataFrame(processed_tweets)
    print(processed_tweets)
    return processed_tweets

In [4]:
tweet_text = train_df['text']
clean_tr_tweet, hashtags_tr_list = cleaning(tweet_text)
processed_tr_tweets = more_cleaning(clean_tr_tweet, train_df)

tweet_text = test_df['text']
clean_tst_tweet, hashtags_tst_list = cleaning(tweet_text)
processed_tst_tweets = more_cleaning(clean_tst_tweet, test_df)

                                                      0  1
0     our deeds are the reason of this  may allah fo...  1
1                 forest fire near la ronge sask canada  1
2     all residents asked to shelter in place are be...  1
3     13000 people receive  evacuation orders in cal...  1
4     just got sent this photo from ruby  as smoke f...  1
...                                                 ... ..
7608  two giant cranes holding a bridge collapse int...  1
7609  the out of control wild fires in california ev...  1
7610               m194 0104 utc5km s of volcano hawaii  1
7611  police investigating after an ebike collided w...  1
7612  the latest more homes razed by northern califo...  1

[7613 rows x 2 columns]
                                                      0
0                    just happened a terrible car crash
1     heard about  is different cities stay safe eve...
2     there is a forest fire at spot pond geese are ...
3                                   apocaly

In [5]:
# Checking number of tweets per target
real, fake, unknown = 0, 0, 0
real_tweet, fake_tweet = {}, {}
for i in range(len(processed_tr_tweets)):
    temp = processed_tr_tweets[1][i]
    if temp == 1:
        real += 1
    elif temp == 0:
        fake += 1
    else:
        unknown += 1
print(real, fake, unknown)

3271 4342 0


In [6]:
def vectorize_tweets(count_vect, data):
    vect_tweets = count_vect.fit_transform(data)
    vect_tweets = vect_tweets.toarray()
    return vect_tweets, count_vect

In [7]:
count_vect = CountVectorizer(analyzer='word', lowercase=False)

len_tr = len(processed_tr_tweets[0])
print('Training length: %d' %len_tr)
len_tst = len(processed_tst_tweets[0])
print('Testing length: %d' %len_tst)
combined_tr_tst = pd.concat([processed_tr_tweets[0], processed_tst_tweets[0]], axis=0)
combined_vect,_ = vectorize_tweets(count_vect, combined_tr_tst)
print('Length of train + test: %d' %len(combined_vect))

vect_tweets = combined_vect[:len_tr]
vect_tst_tweets = combined_vect[len_tr:]

# vect_tweets, count_vect = vectorize_tweets(count_vect, processed_tr_tweets[0])
# vect_tst_tweets,_ = vectorize_tweets(count_vect, processed_tst_tweets[0])

Training length: 7613
Testing length: 3263
Length of train + test: 10876


In [8]:
X_train, X_test, y_train, y_test  = train_test_split(
        vect_tweets, 
        processed_tr_tweets[1],
        train_size=0.80, 
        random_state=True
)

In [9]:
# Train
log_model = LogisticRegression(solver='lbfgs')
log_model = log_model.fit(X=X_train, y=y_train)

In [10]:
# Evaluate model
y_pred = log_model.predict(X_test)
print(len(y_pred))
print(accuracy_score(y_test, y_pred))

1523
0.7957977675640184


In [11]:
# Predict
new_prediction = log_model.predict(vect_tst_tweets)
new_prediction = pd.DataFrame(new_prediction)
new_prediction = pd.concat([test_df['id'], new_prediction], axis=1)
print(new_prediction)

         id  0
0         0  1
1         2  0
2         3  1
3         9  0
4        11  1
...     ... ..
3258  10861  1
3259  10865  1
3260  10868  1
3261  10874  1
3262  10875  0

[3263 rows x 2 columns]


In [12]:
confirmed = ['derailment', 'wreckage', 'debris']
for keyword in test_df['keyword']:
    temp = pd.isna(keyword)
    if temp is False and temp in confirmed:
        print(keyword)

In [13]:
new_prediction = new_prediction.rename({0: 'target'}, axis=1) 
new_prediction.to_csv('submission.csv', index=False)