In [1]:
import re
import string
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import numpy as np
import pandas as pd

# Loading data 

In [2]:
# Read the training and test splits from CSV files in the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Work on an independent copy so the loaded training frame stays untouched
df = train_df.copy(deep=True)
# Preview the first five rows
df.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [25]:
# Summarise the frame: column names, non-null counts and dtypes.
# NOTE(review): the saved output below lists only 3 columns and this cell's
# execution count is out of sequence, so it appears to have been run after the
# column-drop cell further down; on a fresh top-to-bottom run it would
# presumably list every column read from train.csv.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7613 non-null   int64 
 1   text    7613 non-null   object
 2   target  7613 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 178.6+ KB


In [4]:
# Drop the columns that are not used by the model.
# errors='ignore' keeps this cell idempotent: re-running it once the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['keyword', 'location'], errors='ignore')
print("resulting dataframe shape")
print("columns :", df.columns)
print("shape:", df.shape)

resulting dataframe shape
columns : Index(['id', 'text', 'target'], dtype='object')
shape: (7613, 3)


In [5]:
# Count missing entries in every column
df.isna().sum()

id        0
text      0
target    0
dtype: int64

In [6]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
# NOTE(review): `id` takes part in the comparison; if it is unique per row
# (looks like it from the preview, but confirm) this count can never be
# non-zero. To find repeated tweets, consider df.duplicated(subset=['text']).
df.duplicated().sum()

np.int64(0)

# Preprocessing function

In [7]:
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once.

    process_tweet is called once per tweet, so caching avoids re-reading the
    stop-word corpus on every call, and the set gives O(1) membership tests.
    """
    return frozenset(stopwords.words('english'))


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Input:
        tweet : a string containing a tweet
    Output:
        tweet_clean : list of stemmed, lower-cased tokens with stock tickers,
            a leading "RT", hyperlinks, '#' signs, @handles, English stop
            words and punctuation removed
    """
    stemmer = PorterStemmer()
    stop_words = _english_stop_words()
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # strip only the '#' sign; the hashtag word itself is kept
    tweet = re.sub(r'#', '', tweet)
    # lower-case, shorten exaggerated character runs and drop @handles
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                               strip_handles=True)
    # converting sentence to tokens
    tokens = tokenizer.tokenize(tweet)
    # `word not in string.punctuation` is a substring test on the punctuation
    # string, so it drops single punctuation marks and also any token that
    # occurs contiguously inside string.punctuation (e.g. "()").
    tweet_clean = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words and word not in string.punctuation
    ]
    return tweet_clean

# Feature Extraction function 

In [8]:
def build_freqs(tweet, ys):
    """Count how often each processed word occurs under each label.

    Input:
        tweet: a list of tweet strings (a single string is treated as a
            one-element list)
        ys: labels aligned with the tweets (either 0 or 1); any array-like of
            m values is accepted, e.g. shape (m,), (m, 1) or a pandas Series
    Output:
        freqs: a dictionary mapping each (word, label) pair to its frequency;
            labels are stored as plain Python scalars
    """
    if isinstance(tweet, str):
        tweet = [tweet]
    # Flatten to a list of Python scalars. Iterating an (m, 1) array directly
    # would yield 1-element arrays, which are unhashable as dictionary keys.
    labels = np.asarray(ys).reshape(-1).tolist()

    freqs = {}
    for label, text in zip(labels, tweet):
        for word in process_tweet(text):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

In [9]:
# Tweets as a plain Python list, labels kept as a pandas Series
train_x, train_y = df['text'].to_list(), df['target']

In [10]:
# Labels are one-dimensional: one entry per training tweet
train_y.shape

(7613,)

In [11]:
# Build the (word, label) -> count table from the whole training set
freqs = build_freqs(train_x, train_y)
# Number of distinct (word, label) pairs
len(freqs)

15911

In [13]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise.

    Input:
        z: scalar or array
    Output:
        the sigmoid of z, same shape as z
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [18]:
def gradientDescent(x, y, theta, alpha, num_iter):
    """Batch gradient descent for logistic regression.

    Input:
        x: input array of shape (m, n+1)
        y: target labels for the rows of x; any array-like with m values
           (it is reshaped to line up with the hypothesis x @ theta)
        theta: initial weight vector of dimension (n+1, 1)
        alpha: learning rate
        num_iter: number of iterations of model training
    Output:
        J: cross-entropy cost evaluated at the start of the last iteration,
           i.e. with the weights just before the final update (for
           num_iter <= 0 the cost of the initial weights)
        theta: weights after the last iteration (the initial weights when
           num_iter <= 0)
    """
    m = len(x)
    # Align y with the hypothesis shape. A (m,) label vector against a (m, 1)
    # hypothesis would otherwise broadcast `h - y` to (m, m) and silently
    # produce a wrong gradient.
    y = np.asarray(y).reshape((m,) + np.shape(theta)[1:])

    def cost(h):
        return float((-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)))

    J = None
    for i in range(num_iter):
        # hypothesis: sigmoid of the dot product of x and theta
        h = sigmoid(np.dot(x, theta))

        # Only the last iteration's cost is returned, so skip the two log
        # passes on every other iteration.
        if i == num_iter - 1:
            J = cost(h)

        # update the weights
        theta = theta - alpha * np.dot(x.T, (h - y)) / m

    if J is None:
        # No iteration ran: report the cost of the unchanged weights.
        J = cost(sigmoid(np.dot(x, theta)))
    return J, theta

In [88]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
# Turn the eyeballed reference values into a real check
assert np.isclose(tmp_J, 0.67094970, rtol=0, atol=1e-7)
assert np.allclose(np.squeeze(tmp_theta), [4.1e-07, 0.00035658, 7.309e-05], rtol=0, atol=1e-8)
print(f"The cost after training is {tmp_J:.8f}.")
# float(...) unwraps the NumPy scalars so the list prints plain numbers
# instead of np.float64(...) wrappers
print(f"The resulting vector of weights is {[float(round(t, 8)) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [np.float64(4.1e-07), np.float64(0.00035658), np.float64(7.309e-05)]


In [87]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    """Turn one tweet into a [bias, label-1 count, label-0 count] row vector.

    Input:
        tweet: string containing a tweet
        freqs: a dictionary mapping each (word, label) pair to its frequency
        process_tweet: callable that turns the tweet into a list of words
    Output:
        x: a feature vector of dimension (1, 3)
    """
    label1_total = 0.0
    label0_total = 0.0
    # Sum, over the processed words, how often each was seen with each label;
    # words missing from freqs contribute 0.
    for token in process_tweet(tweet):
        label1_total += freqs.get((token, 1), 0)
        label0_total += freqs.get((token, 0), 0)

    # The leading 1.0 is the bias term
    x = np.array([[1.0, label1_total, label0_total]])
    assert(x.shape == (1,3))
    return x

In [21]:
# Checking function
# test 1
# test on training data: features of the first training tweet
# (printed as [bias, summed label-1 counts, summed label-0 counts])
tmp1 = extract_features(train_x[0], freqs)
print(tmp1)

[[  1. 162. 148.]]


In [22]:
# test 2:
# check for when the words are not in the freqs dictionary:
# both counts should stay 0 and only the bias term is set
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


# Training the model

In [24]:
# Labels are still 1-D here; the training cell reshapes them to a column
train_y.shape

(7613,)

In [92]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = np.array(train_y)
Y = Y.reshape((len(train_y), 1))

# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-5, 500000)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.51663997.
The resulting vector of weights is [np.float64(-0.11642076), np.float64(0.00809115), np.float64(-0.00849864)]


In [93]:
def predict_tweet(tweet, freqs, theta):
    """Predict the probability that a tweet has label 1.

    Input:
        tweet: a string containing a tweet
        freqs: a dictionary mapping (word, label) to frequency
        theta: weights (3,1)
    Output:
        y_pred: array of shape (1, 1) holding the predicted probability of
            label=1 (values above 0.5 are treated as label 1 downstream)
    """
    # (1, 3) feature row: [bias, label-1 count, label-0 count]
    x = extract_features(tweet, freqs)

    # reuse the notebook's sigmoid instead of re-deriving it inline
    y_pred = sigmoid(np.dot(x, theta))

    return y_pred

In [94]:
# Spot-check the trained model on one training tweet
my_tweet = train_x[500]
predict_tweet(my_tweet, freqs, theta)

array([[0.96507246]])

In [95]:
# Ground-truth label of that same tweet, to compare with the predicted probability
train_y[500]

np.int64(1)

In [96]:
test_tweet = test_df['text'].to_list()

# Threshold each predicted probability at 0.5 to obtain a hard 0/1 label
y_hats = []
for tweet in test_tweet:
    y_pred = predict_tweet(tweet, freqs, theta)
    y_hats.append(1 if y_pred > 0.5 else 0)
        


In [97]:
# Pair every test id with its predicted label
submission_df = test_df[["id"]].assign(target=y_hats)

In [98]:
# Write the predictions to log_reg.csv without the DataFrame index column
submission_df.to_csv('log_reg.csv', index=False)