In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file path beneath the Kaggle input directory,
# then echo the paths one per line.
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv


In [2]:
import pandas as pd
import numpy as np

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.collections import *
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error

In [5]:
# Read the three competition CSVs in a fixed order: train, test, sample submission.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/commonlitreadabilityprize/train.csv",
        "/kaggle/input/commonlitreadabilityprize/test.csv",
        "/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
    )
)

In [6]:
df_train.shape

(2834, 6)

In [7]:
def text_process(mess):
    """
    Strip ASCII punctuation from a string.

    Every character listed in ``string.punctuation`` is deleted; all other
    characters (letters, digits, whitespace, non-ASCII symbols) are kept in
    their original order. Apostrophes count as punctuation, so "Jack's"
    becomes "Jacks". No stopword removal or tokenisation is performed here.

    Parameters
    ----------
    mess : str
        Raw text to clean.

    Returns
    -------
    str
        ``mess`` with the punctuation characters removed.
    """
    # A deletion-only translation table drops each punctuation character in a
    # single pass instead of building a per-character list and re-joining it.
    return mess.translate(str.maketrans('', '', punctuation))

In [8]:
# Run the same punctuation stripper over the excerpt column of both frames.
for frame in (df_train, df_test):
    frame["excerpt"] = frame["excerpt"].apply(text_process)


In [9]:
from sklearn.pipeline import Pipeline

# Text regression pipeline: raw excerpt -> n-gram counts -> TF-IDF -> ridge.
pipeline = Pipeline([
    ('bow', CountVectorizer(ngram_range=(1,3),
                           stop_words="english")),  # strings to 1- to 3-gram token counts, English stop words dropped
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    # Despite the step name 'classifier', this is a Ridge regressor (linear
    # regression with an L2 penalty, alpha=0.5), not a Naive Bayes classifier.
    # The step name is kept so existing references to it keep working.
    ('classifier', Ridge(alpha=0.5)),
])

In [10]:
# Features come from the "excerpt" column; the label is the "target" column.
X, y = df_train["excerpt"], df_train["target"]

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2550,), (284,), (2550,), (284,))

In [12]:
# Fit every pipeline step on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(ngram_range=(1, 3), stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('classifier', Ridge(alpha=0.5))])

In [13]:
# Predict on the held-out split produced by train_test_split.
predictions = pipeline.predict(X_test)

In [14]:
#print(pipeline.score(X_train, y_train))

In [15]:
# Root-mean-squared error of the held-out predictions against y_test.
print(np.sqrt(mean_squared_error(y_test, predictions)))

0.7668134540882492


In [16]:
# NOTE(review): this echoes the whole prediction array; a slice such as predictions[:10] would keep the output short.
predictions

array([ 0.18235437, -1.90161536, -0.61569516, -0.49182252, -0.3291093 ,
       -1.77733105, -1.7268854 , -1.35360166, -1.97367766,  0.05448817,
        0.01471592, -1.25706153, -0.52223421, -0.70485581,  0.12682376,
       -0.88909039,  0.24095612, -1.34394204,  0.1337548 , -0.61097558,
        0.65418379, -1.24481887, -1.75326462,  0.13071383, -0.97870299,
       -1.96243829, -0.2671039 ,  0.43919021, -1.07990909, -1.19747725,
       -1.48669104, -1.899635  , -0.05450516, -0.73903724,  0.84005697,
       -0.8083583 , -0.22357875, -0.0521453 , -2.1669264 , -0.85100384,
        0.36831876,  0.6900844 , -0.37963526, -1.14824885, -1.47166273,
       -1.35623819, -1.63314374, -0.62580255, -1.04238985, -0.61573494,
        0.21911429,  1.14297953, -0.07866103, -0.91223436, -0.92015457,
       -0.92583284, -2.12464635, -1.1854055 , -0.33143113, -1.83299695,
       -1.81784031, -0.92763888, -0.7578547 , -0.97947562, -0.15529837,
        0.55252706,  1.02488414,  0.44702543, -0.33015361, -1.85

In [17]:
# NOTE: rebinds `predictions` — from here on it holds predictions for df_test, not for the held-out split.
predictions = pipeline.predict(df_test["excerpt"])

In [18]:
# Positional assignment: assumes submission rows are in the same order as df_test — TODO confirm.
submission["target"]=predictions

In [19]:
df_test

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jacks promise that he would kee...
1,f0953f0a5,,,Dotty continued to go to Mrs Grays every night...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...
5,12537fe78,,,To explain transitivity let us look first at a...
6,965e592c0,https://www.africanstorybook.org/#,CC BY 4.0,Milka and John are playing in the garden Her l...


In [20]:
# Index the submission by "id" so that to_csv writes it as the first column.
# Guarded so re-running this cell does not raise KeyError once "id" has
# already been moved out of the columns and into the index.
if submission.index.name != "id":
    submission = submission.set_index("id")

In [21]:
submission

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
c0f722661,-1.459372
f0953f0a5,-0.084465
0df072751,-0.36634
04caf4e0c,-1.635389
0e63f8bea,-1.423051
12537fe78,-0.897324
965e592c0,0.347013


In [22]:
# Write the frame (index included, per the to_csv default) to submission.csv in the current working directory.
submission.to_csv("submission.csv")