In [24]:
import pandas as pd
from pathlib import Path
import os
import re
from unicodedata import normalize
import string
import pickle as pkl

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [25]:
path = Path('.').parent.absolute()

full_train = os.path.join(path, 'raw-dataset', 'train.csv')
train_df = pd.read_csv(full_train, encoding='utf-8')

full_test = os.path.join(path, 'raw-dataset', 'test.csv')
test_df = pd.read_csv(full_test, encoding='utf-8')

# Preprocess Tweet Data

In [26]:
def cleaning(tweet_text, df):
    temp = []
    table = str.maketrans("", "", string.punctuation)
    for tweet in tweet_text:
        # Remove links
        tweet = re.sub(r"http\S+", "", tweet)
        # Remove newline
        tweet = tweet.strip('\n')
        # Remove unicode
        tweet = normalize('NFKD', tweet).encode('ascii','ignore')
        # Remove username
        tweet = re.sub('@[^\s]+','',str(tweet))
#         # Remove hashtag
#         tweet = re.sub('#[^\s]+', '', tweet)
        # Remove punctuation and change to lower case
        tweet = tweet.translate(table).lower()
        # Remove 'b' at the begining for binary
        tweet = tweet.replace('b', '', 1)
        # Remove whitespace at start of sentence
        tweet = tweet.strip()
        temp.append(tweet)
    try:
        # Concatenate training with target
        processed_tweets = pd.concat([pd.DataFrame(temp), df['target']], axis=1)
        processed_tweets = pd.DataFrame(processed_tweets)
    except KeyError:
        processed_tweets = pd.DataFrame(temp)
    print(processed_tweets)
    return processed_tweets   

In [27]:
# Preprocess training and testing tweets
processed_tr_tweets = cleaning(train_df['text'], train_df)
processed_tst_tweets = cleaning(test_df['text'], test_df)

                                                      0  target
0     our deeds are the reason of this earthquake ma...       1
1                 forest fire near la ronge sask canada       1
2     all residents asked to shelter in place are be...       1
3     13000 people receive wildfires evacuation orde...       1
4     just got sent this photo from ruby alaska as s...       1
...                                                 ...     ...
7608  two giant cranes holding a bridge collapse int...       1
7609  the out of control wild fires in california ev...       1
7610               m194 0104 utc5km s of volcano hawaii       1
7611  police investigating after an ebike collided w...       1
7612  the latest more homes razed by northern califo...       1

[7613 rows x 2 columns]
                                                      0
0                    just happened a terrible car crash
1     heard about earthquake is different cities sta...
2     there is a forest fire at spot po

# Tokenization

In [28]:
def vectorize_tweets(count_vect, data):
    vect_tweets = count_vect.fit_transform(data)
    vect_tweets = vect_tweets.toarray()
    return vect_tweets, count_vect

In [33]:
# Convert a collection of text documents to a matrix of token counts
count_vect = CountVectorizer(analyzer='word', lowercase=False, stop_words='english')
# Combine both train and test
# Prevent unequal length of variables after tokenization
combined_tr_tst = pd.concat([processed_tr_tweets[0], processed_tst_tweets[0]], axis=0)
combined_vect,_ = vectorize_tweets(count_vect, combined_tr_tst)

# Check length
len_tr = len(processed_tr_tweets[0])
print('Training length: %d' %len_tr)
len_tst = len(processed_tst_tweets[0])
print('Testing length: %d' %len_tst)
print('Length of train + test: %d' %len(combined_vect))

# Split back to train and test
vect_tweets = combined_vect[:len_tr]
vect_tst_tweets = combined_vect[len_tr:]

Training length: 7613
Testing length: 3263
Length of train + test: 10876


# Balancing Training and testing model data

In [None]:
# Checking number of tweets per target
real, fake, unknown = 0, 0, 0
real_tweet, fake_tweet = [], []
for i in range(len(processed_tr_tweets)):
    temp = processed_tr_tweets['target'][i]
    if temp == 1:
        real += 1
        real_tweet.append(processed_tr_tweets[0][i])
    elif temp == 0:
        fake += 1
        fake_tweet.append(processed_tr_tweets[0][i])
    else:
        unknown += 1
print(real, fake, unknown)
temp = pd.DataFrame(fake_tweet)[:real]
a = real_tweet + fake_tweet
print(len(a))
print(len(processed_tr_tweets[0]))

# Training and Evaluation

In [34]:
# Split training and testing
X_train, X_test, y_train, y_test  = train_test_split(
        vect_tweets, 
        processed_tr_tweets['target'],
        train_size=0.80, 
        random_state=True
)

In [35]:
# Train
model = LogisticRegression(solver='lbfgs', verbose=2)
model = model.fit(
    X=X_train, y=y_train,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.8s finished


In [36]:
# Evaluate model
y_pred = model.predict(X_test)
print(len(y_pred))
print(accuracy_score(y_test, y_pred))

1523
0.804333552199606


# Predict new data

In [10]:
# Predict
new_prediction = model.predict(vect_tst_tweets)
new_prediction = pd.DataFrame(new_prediction)
new_prediction = pd.concat([test_df['id'], new_prediction], axis=1)
print(new_prediction)

         id  0
0         0  1
1         2  1
2         3  1
3         9  0
4        11  1
...     ... ..
3258  10861  1
3259  10865  1
3260  10868  1
3261  10874  1
3262  10875  0

[3263 rows x 2 columns]


In [12]:
def save_submission(new_prediction):
    new_prediction = new_prediction.rename({0: 'target'}, axis=1) 
    new_prediction.to_csv('submission2.csv', index=False)

save_submission(new_prediction)