In [4]:
import os
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from math import sqrt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore", category=FutureWarning)

In [11]:
# upload the datasets
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [12]:
# Train data inspection

train_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [51]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2834 non-null   object 
 1   url_legal       830 non-null    object 
 2   license         830 non-null    object 
 3   excerpt         2834 non-null   object 
 4   target          2834 non-null   float64
 5   standard_error  2834 non-null   float64
dtypes: float64(2), object(4)
memory usage: 133.0+ KB


In [52]:
train_df.describe()

Unnamed: 0,target,standard_error
count,2834.0,2834.0
mean,-0.959319,0.491435
std,1.033579,0.034818
min,-3.676268,0.0
25%,-1.69032,0.468543
50%,-0.91219,0.484721
75%,-0.20254,0.506268
max,1.71139,0.649671


In [27]:
# Test data inspection

test_df.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [32]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         7 non-null      object
 1   url_legal  3 non-null      object
 2   license    3 non-null      object
 3   excerpt    7 non-null      object
dtypes: object(4)
memory usage: 352.0+ bytes


In [54]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
    
def nlp_preprocessing(text):
    # lowercase
    text = text.lower()
    # remove digits
    text = re.sub(r'd+','', text)
    # remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
    # remove spaces
    text = text.strip()
    # tokenize
    word_tokens = word_tokenize(text)
    # remove stopwords
    filtered_text = [word for word in word_tokens if word not in stop_words]
    text = ' '.join(str(elem) for elem in filtered_text)
    token_text = word_tokenize(text)
    for word in token_text:
        # lemmatization
        lemmatizer.lemmatize(word)
        # stemming
        stemmer.stem(word)
    return text

In [None]:
train_df['prepared_text'] = ""

for i in range(0, len(train_df)):
    train_df['prepared_text'][i] = nlp_preprocessing(train_df.excerpt[i])
    
train_df['prepared_text']

In [57]:
X = train_df.prepared_text
y = train_df.target

# split train dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, shuffle=False)

In [58]:
tfidf_vectorizer = TfidfVectorizer() 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

tfidf_train.shape, X_train.shape, tfidf_test.shape, X_test.shape

((2692, 27476), (2692,), (142, 27476), (142,))

In [59]:
regression = LinearRegression().fit(tfidf_train, y_train)

y_predict = regression.predict(tfidf_test)

In [60]:
rmse = sqrt(mean_squared_error(y_test, y_predict))
print(f"Root Mean Square: {rmse}")

Root Mean Square: 0.8161517154056934


In [61]:
test_df['prepared_text'] = ""
for i in range(0, len(test_df)):
    test_df['prepared_text'][i] = nlp_preprocessing(test_df.excerpt[i])

hope lay jacks promise woul keep bright light burning upper story guie course clear night light visible village somehow faile take account state weather air full eying flakes woul rener healight locomotive invisible hunre yars istant strange important fact never occurre fully fourth mile village looking vain beacon light anger situation struck halte certain go wrong sai power follow irect course without something serve compass go back village wait till morning
otty continue go mrs grays every night milk sometimes katie went always pause uner acorntree playe king queen otty sai wishe coul ever remember bring nipperkins case milk woul taste great eal like nectar nipperkins pair hanle cups chilren suppose silver always use table otty knew oing wrong every time playe king queen knew milk mrs grays still sai ruthie neent give much measure presse run queenie shoul rink great eal woul always quart left yes know woul mrs gray never sai anything milk merely poure pan gave back pail otty asking 

In [62]:
X_test = test_df.prepared_text

test_tfidf = tfidf_vectorizer.transform(X_test) 
test_tfidf.shape, tfidf_train.shape

((7, 27476), (2692, 27476))

In [63]:
y_predict = regression.predict(test_tfidf)

y_predict

array([-1.53936543, -0.32290477, -0.36056147, -1.70910902, -1.35027085,
       -0.93543477, -0.14130872])

In [54]:
results_df = pd.DataFrame()
results_df['id'] = test_df['id']
results_df['target'] = y_predict
results_df.reset_index(inplace=True,drop=True)
results_df.to_csv('submission.csv',index=False) 