In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import gensim
import nltk
import spacy
import re
import spacy
import matplotlib.pyplot as plt

from ydata_profiling import ProfileReport
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from IPython.display import Image
from spacy import displacy
from transformers import pipeline
%matplotlib inline

In [None]:
# Single source of truth for the competition data directory, so the location is
# edited in one place instead of inside every read_csv call.
DATA_DIR = '/kaggle/input/commonlit-evaluate-student-summaries'

# Prompt tables and student-summary tables, for both the test and train splits.
promt_test = pd.read_csv(os.path.join(DATA_DIR, 'prompts_test.csv'))
promt_train = pd.read_csv(os.path.join(DATA_DIR, 'prompts_train.csv'))
summary_test = pd.read_csv(os.path.join(DATA_DIR, 'summaries_test.csv'))
summary_train = pd.read_csv(os.path.join(DATA_DIR, 'summaries_train.csv'))

In [None]:
# (rows, columns) of the training tables: summaries first, then prompts.
for train_frame in (summary_train, promt_train):
    print(train_frame.shape)

In [None]:
# (rows, columns) of the test tables: summaries first, then prompts.
for test_frame in (summary_test, promt_test):
    print(test_frame.shape)

In [None]:
# Preview the first rows of the training summaries.
summary_train.head()

In [None]:
# Display the training prompts table.
promt_train

In [None]:
# Show the `text` value stored under index label 0 of the training summaries.
summary_train.text[0]

In [None]:
# Count missing values per column in the training summaries.
summary_train.isnull().sum() 

In [None]:
# Count missing values per column in the test summaries.
summary_test.isnull().sum()

- We have no missing values.

In [None]:
# Column dtypes of the training summaries.
summary_train.dtypes

# Distributions of targets (Content / Wording Scores)

In [None]:
# Histogram of the `content` target, labelled so the figure stands on its own.
plt.hist(summary_train.content)
plt.title("Distribution of content scores")
plt.xlabel("content")
plt.ylabel("Count")
plt.show()

In [None]:
# Histogram of the `wording` target, labelled so the figure stands on its own.
plt.hist(summary_train.wording)
plt.title("Distribution of wording scores")
plt.xlabel("wording")
plt.ylabel("Count")
plt.show()

In [None]:
# Summary statistics of the `wording` target.
summary_train.wording.describe()

In [None]:
# Summary statistics of the `content` target.
summary_train.content.describe()

# Data Preparation

In [None]:
# Build a ydata-profiling report over the training summaries; the bare `profile`
# expression on the last line makes the report the cell's output.
profile = ProfileReport(summary_train, title="Profiling Report")
profile

In [None]:
stop_words = stopwords.words('english')

# Union of the NLTK and gensim stop-word collections, built once as a frozenset so
# each per-token membership test is a hash lookup instead of a scan of the NLTK list.
_ALL_STOP_WORDS = frozenset(stop_words) | frozenset(gensim.parsing.preprocessing.STOPWORDS)


def preprocess(text, join_back=True):
    """Tokenize ``text`` and drop stop words and very short tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only if
    it is longer than two characters and appears in neither the NLTK English
    stop-word list nor gensim's ``STOPWORDS``.

    Parameters
    ----------
    text : str
        Raw text to clean.
    join_back : bool, default True
        If True, return the kept tokens joined by single spaces; otherwise
        return them as a list.

    Returns
    -------
    str or list of str
        The cleaned text, or the list of kept tokens when ``join_back`` is False.
    """
    result = [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in _ALL_STOP_WORDS
    ]

    if join_back:
        result = ' '.join(result)

    return result

In [None]:
# Clean the student summaries and the prompt texts element by element with
# `preprocess`, storing the results in new columns.
summary_train['processed_summary'] = summary_train['text'].map(preprocess)
promt_train['processed_promt'] = promt_train['prompt_text'].map(preprocess)

In [None]:
# Raw summary text stored under index label 0.
summary_train['text'][0]

In [None]:
# The same row after `preprocess`, for comparison with the raw text.
summary_train['processed_summary'][0]

In [None]:
# Draw the same word cloud for the raw and the preprocessed summaries. The loop
# replaces two copy-pasted blocks, so both figures always share identical settings.
for cloud_title, text_column in (("Original Text", 'text'), ("Processed Text", 'processed_summary')):
    plt.figure(figsize=(20, 20))
    plt.title(cloud_title)
    wc = WordCloud(max_words=500, width=800, height=400).generate(" ".join(summary_train[text_column]))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # pixel tick marks carry no meaning for a word cloud
    plt.show()

In [None]:
# First two training summaries, now including the `processed_summary` column.
summary_train.head(2)

In [None]:
# First two training prompts, now including the `processed_promt` column.
promt_train.head(2)

In [None]:
# Attach each prompt's columns to its summaries (pandas' default inner join on
# `prompt_id`), then preview the merged frame.
train = summary_train.merge(promt_train, on='prompt_id')
train.head()

In [None]:
# (rows, columns) of the merged training frame.
train.shape

In [None]:
# Length features of the preprocessed summary: character count, number of NLTK
# word tokens and number of NLTK sentences.
processed_summaries = train["processed_summary"]
train["n_symbols"] = processed_summaries.map(len)
train["n_words"] = processed_summaries.map(
    lambda doc: len(nltk.tokenize.word_tokenize(doc))
)
train["n_sents"] = processed_summaries.map(
    lambda doc: len(nltk.tokenize.sent_tokenize(doc))
)

In [None]:
# First two rows of the merged frame with the new length features.
train.head(2)

In [None]:
# Number of distinct values taken by `n_sents`.
train.n_sents.nunique()

- Let's drop the `n_sents` column: it has only one unique value, so it carries no information.

In [None]:
# Reassign instead of using inplace=True, and tolerate an already-missing column,
# so this cell can be re-run without raising KeyError.
train = train.drop(columns='n_sents', errors='ignore')

In [None]:
# Content score against the word count of the processed summary, with labelled
# axes so the figure is readable on its own.
plt.scatter(train.n_words, train.content)
plt.title("Content score vs. n_words")
plt.xlabel("n_words")
plt.ylabel("content")
plt.show()

In [None]:
# For each length feature, plot its distribution among summaries whose content
# (then wording) score exceeds the threshold. Each histogram gets a label so that
# plt.legend() has an entry to show instead of producing an empty legend.
score_threshold = 0.5
for column in ["n_symbols", "n_words"]:
    for target, target_label in (("content", "Content"), ("wording", "Wording")):
        plt.figure(figsize=(10, 5))
        plt.title(f"Train {column}. {target_label} score >{score_threshold}")
        train.loc[train[target] > score_threshold, column].hist(
            bins=30, label=f"{target} > {score_threshold}"
        )
        plt.legend()
        plt.show()

- There is a visible pattern in the word/symbol counts: a score above 0.5 is most likely when a summary has neither too few nor too many words/symbols.

# Modeling experiments

In [None]:
# The two regression targets, taken from the merged training frame.
target_content, target_wording = train['content'], train['wording']

In [None]:
# Display the `content` target series.
target_content

## Creating TF-IDF features

In [None]:
# One TF-IDF vectorizer per text source, both created with max_features=1000.
tfidf_vectorizer_sum, tfidf_vectorizer_promt = (
    TfidfVectorizer(max_features=1000) for _ in range(2)
)

# NOTE(review): both vectorizers are fitted on the complete `train` frame, i.e.
# before the later train_test_split - confirm that this is intended.
summaries_train_tf = tfidf_vectorizer_sum.fit_transform(train['processed_summary'])
prompt_train_tf = tfidf_vectorizer_promt.fit_transform(train['processed_promt'])

In [None]:
# Show the repr of the summary TF-IDF matrix.
summaries_train_tf

In [None]:
import scipy.sparse

# Column-wise concatenation: prompt TF-IDF columns first, summary TF-IDF columns second.
train_feature_blocks = (prompt_train_tf, summaries_train_tf)
features = scipy.sparse.hstack(train_feature_blocks)

In [None]:
# (rows, columns) of the training frame, for comparison with the feature matrix.
train.shape

In [None]:
# Show the repr of the stacked training feature matrix.
features

In [None]:
# Hold out 20% of the rows for the `content` target; the fixed seed keeps the
# split repeatable.
content_split = train_test_split(features, target_content, test_size=.2, random_state=7)
X_train_train, X_train_test, y_train_train, y_train_test = content_split

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest for the `content` target. n_jobs=-1 builds the trees on all
# available cores; random_state still fixes the randomness of the forest.
rf = RandomForestRegressor(n_estimators=100, random_state=7, n_jobs=-1)

rf.fit(X_train_train, y_train_train)

In [None]:
# Hold out 20% of the rows for the `wording` target, using the same seed as the
# `content` split.
wording_split = train_test_split(features, target_wording, test_size=.2, random_state=7)
X_train_train_1, X_train_test_1, y_train_train_1, y_train_test_1 = wording_split

In [None]:
# Random forest for the `wording` target. n_jobs=-1 builds the trees on all
# available cores; random_state still fixes the randomness of the forest.
rf1 = RandomForestRegressor(n_estimators=100, random_state=7, n_jobs=-1)



In [None]:
# `rf` is already fitted in the cell that creates it, so only the wording model
# still needs training here; fitting `rf` again would just repeat the same work.
rf1.fit(X_train_train_1, y_train_train_1)

# Score both models on their held-out 20% splits, which are otherwise never used.
for target_name, model, X_holdout, y_holdout in (
    ("content", rf, X_train_test, y_train_test),
    ("wording", rf1, X_train_test_1, y_train_test_1),
):
    holdout_rmse = np.sqrt(np.mean((model.predict(X_holdout) - np.asarray(y_holdout)) ** 2))
    print(f"{target_name} hold-out RMSE: {holdout_rmse:.4f}")

# Predictions

In [None]:
# Display the test summaries table.
summary_test

In [None]:
# Display the test prompts table.
promt_test

In [None]:
# Apply the same cleaning to the test texts as was applied to the training texts.
summary_test['processed_summary'] = summary_test['text'].apply(preprocess)
promt_test['processed_promt'] = promt_test['prompt_text'].apply(preprocess)

# how="left" keeps every test summary even if its prompt_id has no matching prompt
# (an inner join would silently drop it and leave the submission short).
# validate="many_to_one" raises if a prompt_id is duplicated in the prompts table,
# which would otherwise duplicate summaries in the merged frame.
test = pd.merge(summary_test, promt_test, on="prompt_id", how="left", validate="many_to_one")
# Summaries without a matching prompt get an empty prompt text, which the
# vectorizer turns into an all-zero row.
test['processed_promt'] = test['processed_promt'].fillna('')

# Re-use the vectorizers fitted on the training data (transform only, no refit).
summary_test_tf = tfidf_vectorizer_sum.transform(test['processed_summary'])
promt_test_tf = tfidf_vectorizer_promt.transform(test['processed_promt'])

# Same column order as the training features: prompt block first, summary block second.
features_test = scipy.sparse.hstack([promt_test_tf, summary_test_tf])

In [None]:
# Display the test summaries again, now with the `processed_summary` column.
summary_test

In [None]:
# Show the repr of the test summary TF-IDF matrix.
summary_test_tf

In [None]:
# Show the repr of the test prompt TF-IDF matrix.
promt_test_tf

In [None]:
# Show the repr of the stacked test feature matrix.
features_test

In [None]:
# Predict both targets for the test features: `rf` gives content, `rf1` gives wording.
pred_content, pred_wording = (
    forest.predict(features_test) for forest in (rf, rf1)
)

In [None]:
# Load the competition's sample submission file and preview its first rows.
sample_subm = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')
sample_subm.head()

In [None]:
# Submission frame: the student ids from the merged test frame plus the two
# prediction arrays, in the column order student_id, content, wording.
results = test[['student_id']].assign(
    content=pred_content,
    wording=pred_wording,
)

In [None]:
# Write the predictions to submission.csv without the index column.
results.to_csv('submission.csv', index=False)