In [5]:
import os
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from math import sqrt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore", category=FutureWarning)

In [48]:
# upload the datasets
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [50]:
# Train data inspection

train_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [51]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2834 non-null   object 
 1   url_legal       830 non-null    object 
 2   license         830 non-null    object 
 3   excerpt         2834 non-null   object 
 4   target          2834 non-null   float64
 5   standard_error  2834 non-null   float64
dtypes: float64(2), object(4)
memory usage: 133.0+ KB


In [52]:
train_df.describe()

Unnamed: 0,target,standard_error
count,2834.0,2834.0
mean,-0.959319,0.491435
std,1.033579,0.034818
min,-3.676268,0.0
25%,-1.69032,0.468543
50%,-0.91219,0.484721
75%,-0.20254,0.506268
max,1.71139,0.649671


In [27]:
# Test data inspection

test_df.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [32]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         7 non-null      object
 1   url_legal  3 non-null      object
 2   license    3 non-null      object
 3   excerpt    7 non-null      object
dtypes: object(4)
memory usage: 352.0+ bytes


In [20]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
    
def nlp_preprocessing(text):
    # lowercase
    text = text.lower()
    # remove digits
    text = re.sub(r'd+','', text)
    # remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
    # remove spaces
    text = text.strip()
    # tokenize
    word_tokens = word_tokenize(text)
    # remove stopwords
    filtered_text = [word for word in word_tokens if word not in stop_words]
    text = ' '.join(str(elem) for elem in filtered_text)
    # stemming
    token_text = word_tokenize(text)
    for word in token_text:
        text = f"{text}  {stemmer.stem(word)}"
    # lemmatization
    input_text = word_tokenize(text)
    for word in input_text:
        text = f"{text}{lemmatizer.lemmatize(word)}"
    return text

In [25]:
train_df['prepared_text'] = ""

for i in range(0, len(train_df)):
    train_df['prepared_text'][i] = nlp_preprocessing(train_df.excerpt[i])
    
train_df['prepared_text']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['prepared_text'][i] = nlp_preprocessing(train_df.excerpt[i])


0       young people returne ballroom presente eciely ...
1       inner time mrs fayre somewhat silent eyes rest...
2       roger ha preicte snow eparte quickly came two ...
3       outsie palace great garen walle roun fille ful...
4       upon time three bears live together house woo ...
                              ...                        
2829    think inosaurs live picture see hot steamy swa...
2830    soli solis usually har molecules packe togethe...
2831    secon state matter iscuss liqui solis har thin...
2832    solis shapes actually touch three imensions me...
2833    animals mae many cells eat things igest insie ...
Name: prepared_text, Length: 2834, dtype: object

In [34]:
X = train_df.prepared_text
y = train_df.target

# split train dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, shuffle=False)

In [36]:
tfidf_vectorizer = TfidfVectorizer() 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

tfidf_train.shape, X_train.shape, tfidf_test.shape, X_test.shape

((2692, 40548), (2692,), (142, 40548), (142,))

In [39]:
regression = LinearRegression().fit(tfidf_train, y_train)

y_predict = regression.predict(tfidf_test)

In [42]:
rmse = sqrt(mean_squared_error(y_test, y_predict))
print(f"Root Mean Square: {rmse}")

Root Mean Square: 0.8269815161099475


In [44]:
test_df['prepared_text'] = ""
for i in range(0, len(test_df)):
    test_df['prepared_text'][i] = nlp_preprocessing(test_df.excerpt[i])

In [45]:
X_test = test_df.prepared_text

test_tfidf = tfidf_vectorizer.transform(X_test) 
test_tfidf.shape, tfidf_train.shape

((7, 40548), (2692, 40548))

In [58]:
y_predict = regression.predict(test_tfidf)

y_predict

array([-1.69977255, -0.3324573 ,  0.03656068, -1.73883711, -1.30603957,
       -0.97176293,  0.50918833])

In [54]:
results_df = pd.DataFrame()
results_df['id'] = test_df['id']
results_df['target'] = y_predict
results_df.reset_index(inplace=True,drop=True)
results_df.to_csv('submission.csv',index=False) 