In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the data
data = pd.read_csv("CommonLit_train.csv")
data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [3]:
#Find the shape of the Data
print("SHAPE: ",data.shape)

print('*******************************')

# Checking for any missing values
print(data.isnull().any())

SHAPE:  (2834, 6)
*******************************
id                False
url_legal          True
license            True
excerpt           False
target            False
standard_error    False
dtype: bool


In [4]:
data.isnull().sum()/len(data)*100

id                 0.000000
url_legal         70.712773
license           70.712773
excerpt            0.000000
target             0.000000
standard_error     0.000000
dtype: float64

In [5]:
# dropping columns with more null values from the dataset
data = data.drop(['url_legal', 'license'], axis = 1)

# dropping ID & unwanted columns from the dataset
data = data.drop(['id','standard_error'], axis = 1)

data.head()

Unnamed: 0,excerpt,target
0,When the young people returned to the ballroom...,-0.340259
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,"As Roger had predicted, the snow departed as q...",-0.580118
3,And outside before the palace a great garden w...,-1.054013
4,Once upon a time there were Three Bears who li...,0.247197


In [6]:
data.isnull().sum() / len(data) * 100

excerpt    0.0
target     0.0
dtype: float64

In [7]:
# Convert all upper case text to lower case text
data['excerpt'] = data['excerpt'].str.lower()
data.head()

Unnamed: 0,excerpt,target
0,when the young people returned to the ballroom...,-0.340259
1,"all through dinner time, mrs. fayre was somewh...",-0.315372
2,"as roger had predicted, the snow departed as q...",-0.580118
3,and outside before the palace a great garden w...,-1.054013
4,once upon a time there were three bears who li...,0.247197


In [8]:
# Remove all the punctuations
import string
eng_punct = string.punctuation

def remove_punc(text):
    translator = str.maketrans('', '', eng_punct)
    return text.translate(translator)

data['excerpt'] = data['excerpt'].apply(lambda x: remove_punc(x))
data.head()

Unnamed: 0,excerpt,target
0,when the young people returned to the ballroom...,-0.340259
1,all through dinner time mrs fayre was somewhat...,-0.315372
2,as roger had predicted the snow departed as qu...,-0.580118
3,and outside before the palace a great garden w...,-1.054013
4,once upon a time there were three bears who li...,0.247197


In [9]:
# Removing Numbers from the text column
import re
def remove_nums(text):
    return re.sub('[0-9]+', "", text)

data['excerpt'] = data['excerpt'].apply(lambda x: remove_nums(x))
data.head()

Unnamed: 0,excerpt,target
0,when the young people returned to the ballroom...,-0.340259
1,all through dinner time mrs fayre was somewhat...,-0.315372
2,as roger had predicted the snow departed as qu...,-0.580118
3,and outside before the palace a great garden w...,-1.054013
4,once upon a time there were three bears who li...,0.247197


In [10]:
# Stemming on Dataset
from nltk.stem import PorterStemmer
pst = PorterStemmer()

def stemming_on_text(data):
    text = [pst.stem(data) for word in data]
    return data

data['excerpt'] = data['excerpt'].apply(lambda x: stemming_on_text(x))
data.head()

Unnamed: 0,excerpt,target
0,when the young people returned to the ballroom...,-0.340259
1,all through dinner time mrs fayre was somewhat...,-0.315372
2,as roger had predicted the snow departed as qu...,-0.580118
3,and outside before the palace a great garden w...,-1.054013
4,once upon a time there were three bears who li...,0.247197


In [11]:
# Lemmatization on Dataset
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def lemmatizer_on_text(data):
    text = [wnl.lemmatize(word) for word in data]
    return data

data['excerpt'] = data['excerpt'].apply(lambda x: lemmatizer_on_text(x))
data.head()

Unnamed: 0,excerpt,target
0,when the young people returned to the ballroom...,-0.340259
1,all through dinner time mrs fayre was somewhat...,-0.315372
2,as roger had predicted the snow departed as qu...,-0.580118
3,and outside before the palace a great garden w...,-1.054013
4,once upon a time there were three bears who li...,0.247197


In [12]:
# Word TOkenization for the text column
from nltk import word_tokenize

def tk_on_text(data):
    text = [word_tokenize(word) for word in data]
    return data

data['excerpt'] = data['excerpt'].apply(lambda x: tk_on_text(x))
data.head()

Unnamed: 0,excerpt,target
0,when the young people returned to the ballroom...,-0.340259
1,all through dinner time mrs fayre was somewhat...,-0.315372
2,as roger had predicted the snow departed as qu...,-0.580118
3,and outside before the palace a great garden w...,-1.054013
4,once upon a time there were three bears who li...,0.247197


In [13]:
# Removing stop words from the text column
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])


data['excerpt'] = data['excerpt'].apply(remove_stopwords)
data.head()

Unnamed: 0,excerpt,target
0,young people returned ballroom presented decid...,-0.340259
1,dinner time mrs fayre somewhat silent eyes res...,-0.315372
2,roger predicted snow departed quickly came two...,-0.580118
3,outside palace great garden walled round fille...,-1.054013
4,upon time three bears lived together house woo...,0.247197


In [14]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,excerpt
2743,building rotary presses printing illustrated p...
2347,idea trip bobs yacht suited everybody decided ...
2387,seeing front door wide open enchanter walked q...
2202,widow cried called poor lost lamb called lot n...
786,jacobitism much smaller extent political movem...
...,...
1638,steam supplied two circular return tube boiler...
1095,living things different things alive usually e...
1130,id always longed adventures see life dreadful ...
1294,times one dread lies heavy heart brain—the tho...


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv

cv.fit(X_train['excerpt'])

X_train = cv.transform(X_train['excerpt'])
X_test = cv.transform(X_test['excerpt'])
X_train

<2267x25496 sparse matrix of type '<class 'numpy.int64'>'
	with 160874 stored elements in Compressed Sparse Row format>

In [16]:
# Build and train the model
from sklearn.svm import SVR
lr = SVR(C=1.0, epsilon=0.2)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

In [17]:
# Evaluate the model
from sklearn.metrics import r2_score
accuracy = r2_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.44413460364962876
