In [41]:
# Typing helper, numeric / dataframe stack and progress bars.
from typing import List
import numpy as np
import pandas as pd
from tqdm import tqdm


import nltk
# Fetch the NLTK resources used below: the stop-word corpus (stopwords.words)
# and the 'punkt' models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
# Register tqdm with pandas so Series/DataFrame gain `progress_apply`.
tqdm.pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gyg_9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gyg_9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
# Install the small English spaCy pipeline that Preprocessor loads with
# spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [43]:
# Folder holding the competition CSV files (trailing slash is required because
# the paths below are built by plain string formatting).
DATA_DIR = "./CommonLit_data/"

# Prompts: one row per reading passage; summaries: one row per student answer.
prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
prompts_test = pd.read_csv(f"{DATA_DIR}prompts_test.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

In [44]:
class Preprocessor:
    """Build hand-crafted features relating a student summary to its prompt.

    ``run`` tokenizes both texts, spell-corrects the summaries, counts
    misspellings, merges summaries with their prompts on ``prompt_id`` and adds
    word / bigram / trigram overlap and quote-reuse features.

    NOTE: ``word_overlap_count`` now excludes stop words (it previously kept
    *only* stop words). Feature files cached with the old behaviour must be
    regenerated so that train and test features stay comparable.
    """

    def __init__(self) -> None:
        """Load the NLTK stop words, the spaCy pipeline and both spell-checkers."""
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))

        self.spacy_ner_model = spacy.load('en_core_web_sm')
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker()

        # Entity labels reported by ner_overlap_count(mode="test"). Nothing in
        # this class fills it in; the caller must assign it before test-mode use.
        self.ner_keys = None

    def word_overlap_count(self, row):
        """Count distinct tokens shared by prompt and summary, ignoring stop words.

        Args:
            row: mapping with ``'prompt_tokens'`` and ``'summary_tokens'`` token lists.

        Returns:
            Number of distinct shared tokens that are not in ``self.STOP_WORDS``
            (the comparison is case-sensitive). When ``self.STOP_WORDS`` is
            empty no filtering is applied.
        """
        shared_words = set(row['prompt_tokens']).intersection(row['summary_tokens'])
        if self.STOP_WORDS:
            # Drop stop words: they appear in almost every text and carry no
            # information about how much content the summary shares.
            shared_words = shared_words.difference(self.STOP_WORDS)
        return len(shared_words)

    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token sequence ``token``, in order."""
        # zip over n progressively shifted views of the sequence; zip stops at
        # the shortest view, so sequences shorter than n yield no n-grams.
        shifted_views = [token[i:] for i in range(n)]
        return [" ".join(ngram) for ngram in zip(*shifted_views)]

    def ngram_co_occurrence(self, row, n: int) -> int:
        """Count distinct n-grams that occur in both the prompt and the summary.

        Args:
            row: mapping with ``'prompt_tokens'`` and ``'summary_tokens'`` token lists.
            n: n-gram size.
        """
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams & summary_ngrams)

    def ner_overlap_count(self, row, mode: str):
        """Count named entities shared by prompt and summary, grouped by entity label.

        Entities are compared as ``(lower-cased text, label)`` pairs.

        Args:
            row: mapping with the raw ``'prompt_text'`` and ``'text'`` strings.
            mode: ``"train"`` returns counts for whichever labels occur;
                ``"test"`` returns one entry per label in ``self.ner_keys``
                (``None`` for labels with no shared entity).

        Returns:
            dict mapping entity label to the number of shared entities, or
            ``None`` for any other ``mode``.

        Raises:
            Exception: if the loaded model is neither a spaCy nor a stanza pipeline.
            ValueError: if ``mode == "test"`` and ``self.ner_keys`` was never set.
        """
        model = self.spacy_ner_model
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        # The two supported libraries expose the entity label under different
        # attribute names.
        model_description = str(model)
        if "spacy" in model_description:
            label_attribute = "label_"
        elif "stanza" in model_description:
            label_attribute = "type"
        else:
            raise Exception("Model not supported")

        def entity_set(doc):
            return {
                (entity.text.lower(), getattr(entity, label_attribute))
                for entity in doc.ents
            }

        intersecting_ners = entity_set(prompt) & entity_set(summary)
        ner_dict = dict(Counter(label for _, label in intersecting_ners))

        if mode == "train":
            return ner_dict
        elif mode == "test":
            ner_keys = getattr(self, "ner_keys", None)
            if ner_keys is None:
                raise ValueError(
                    "ner_keys must be set on the Preprocessor before calling "
                    "ner_overlap_count with mode='test'"
                )
            return {key: ner_dict.get(key) for key in ner_keys}

    def quotes_count(self, row):
        """Count double-quoted passages of the summary that occur verbatim in the prompt.

        Args:
            row: mapping with the raw ``'text'`` (summary) and ``'prompt_text'`` strings.
        """
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        return sum(quote in text for quote in quotes_from_summary)

    def spelling(self, text):
        """Return how many distinct whitespace-separated words the spell-checker does not know."""
        return len(self.spellchecker.unknown(text.split()))

    def add_spelling_dictionary(self, tokens: List[str]) -> None:
        """Register ``tokens`` as known words in both pyspellchecker and autocorrect."""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token: 1000 for token in tokens})

    @staticmethod
    def _overlap_ratio(counts: pd.Series, lengths: pd.Series, n: int) -> pd.Series:
        """Return ``counts / (lengths - n + 1)``, using 0.0 where no n-gram can exist.

        A text with fewer than ``n`` tokens has no n-grams, so the denominator
        is zero or negative there; plain division would give NaN (0 / 0).
        """
        possible_ngrams = lengths - (n - 1)
        ratios = counts / possible_ngrams.where(possible_ngrams > 0)
        return ratios.fillna(0.0)

    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        """Compute all features and return one row per summary.

        Args:
            prompts: frame with at least ``prompt_id`` and ``prompt_text``.
            summaries: frame with at least ``prompt_id`` and ``text``.
            mode: accepted for interface compatibility; not used by this method.

        Returns:
            ``summaries`` left-merged with ``prompts`` plus the feature columns;
            the intermediate token-list columns are dropped.

        Side effects:
            Prompt tokens are added to both spell-checker dictionaries. The
            input frames themselves are left untouched.
        """
        # Work on copies so the caller's frames are not mutated and re-running
        # the calling cell always starts from the same inputs.
        prompts = prompts.copy()
        summaries = summaries.copy()

        # before merge preprocess: tokenize each text once and derive its length
        prompt_tokens = prompts["prompt_text"].apply(word_tokenize)
        prompts["prompt_length"] = prompt_tokens.apply(len)
        prompts["prompt_tokens"] = prompt_tokens

        summary_tokens = summaries["text"].apply(word_tokenize)
        summaries["summary_length"] = summary_tokens.apply(len)
        summaries["summary_tokens"] = summary_tokens

        # Add prompt tokens into the spelling dictionaries first, so words taken
        # from the prompt are neither "corrected" nor counted as misspellings.
        for tokens in prompts["prompt_tokens"]:
            self.add_spelling_dictionary(tokens)

        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(self.speller)

        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        # (the summary/prompt length ratio feature was deliberately removed)
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)

        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(2,), axis=1
        )
        input_df['bigram_overlap_ratio'] = self._overlap_ratio(
            input_df['bigram_overlap_count'], input_df['summary_length'], 2
        )

        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = self._overlap_ratio(
            input_df['trigram_overlap_count'], input_df['summary_length'], 3
        )

        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)

        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
# Single shared instance: construction loads the NLTK stop words, the spaCy
# pipeline and both spell-checkers.
preprocessor = Preprocessor()

In [45]:
# Reuse the cached training features when the CSV exists; otherwise rebuild
# them from the raw frames so the notebook still runs on a fresh checkout.
# NOTE: delete the cached CSV whenever Preprocessor changes, otherwise the
# cached train features and the freshly computed test features can disagree.
try:
    train = pd.read_csv(DATA_DIR + "train_preprocessed.csv")
    # The cache was written with its index, which comes back as 'Unnamed: 0'.
    train = train.drop(columns=['Unnamed: 0'])
except FileNotFoundError:
    train = preprocessor.run(prompts_train, summaries_train, mode="train")

test = preprocessor.run(prompts_test, summaries_test, mode="test")

100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 4013.69it/s]
100%|██████████| 4/4 [00:00<00:00, 4011.77it/s]
100%|██████████| 4/4 [00:00<00:00, 4013.69it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]


In [46]:
# Preview the first rows of the training features.
train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background [PARAGRAPH] The Third Wave experime...,678,14,4,0.063492,0,0.0,0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1088,18,22,0.415094,10,0.192308,0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,673,22,52,0.19403,22,0.082397,2
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,673,6,6,0.222222,5,0.192308,0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,Background [PARAGRAPH] The Third Wave experime...,678,23,27,0.116883,5,0.021739,4


In [47]:
# Check dtypes and non-null counts of every training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7165 entries, 0 to 7164
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   student_id             7165 non-null   object 
 1   prompt_id              7165 non-null   object 
 2   text                   7165 non-null   object 
 3   content                7165 non-null   float64
 4   wording                7165 non-null   float64
 5   summary_length         7165 non-null   int64  
 6   fixed_summary_text     7165 non-null   object 
 7   splling_err_num        7165 non-null   int64  
 8   prompt_question        7165 non-null   object 
 9   prompt_title           7165 non-null   object 
 10  prompt_text            7165 non-null   object 
 11  prompt_length          7165 non-null   int64  
 12  word_overlap_count     7165 non-null   int64  
 13  bigram_overlap_count   7165 non-null   int64  
 14  bigram_overlap_ratio   7165 non-null   float64
 15  trig

In [48]:
# List the column labels of the training frame.
train.columns

Index(['student_id', 'prompt_id', 'text', 'content', 'wording',
       'summary_length', 'fixed_summary_text', 'splling_err_num',
       'prompt_question', 'prompt_title', 'prompt_text', 'prompt_length',
       'word_overlap_count', 'bigram_overlap_count', 'bigram_overlap_ratio',
       'trigram_overlap_count', 'trigram_overlap_ratio', 'quotes_count'],
      dtype='object')

In [49]:
# Score columns the model has to predict.
targets = ["content", "wording"]

# Identifiers and raw text are not model features; they are removed together
# with the targets when building the model input.
drop_columns = [
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    *targets,
]

In [50]:
# Targets: the score columns, in the order given by `targets`.
train_target = train[targets]
# Inputs: everything that is left after removing ids, raw text and targets.
train_input = train.drop(columns=drop_columns)

In [51]:
# Preview the feature matrix passed to the model.
train_input.head()

Unnamed: 0,summary_length,splling_err_num,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count
0,64,5,678,14,4,0.063492,0,0.0,0
1,54,2,1088,18,22,0.415094,10,0.192308,0
2,269,32,673,22,52,0.19403,22,0.082397,2
3,28,5,673,6,6,0.222222,5,0.192308,0
4,232,29,678,23,27,0.116883,5,0.021739,4


In [52]:
# Preview the two regression targets.
train_target.head()

Unnamed: 0,content,wording
0,0.205683,0.380538
1,-0.548304,0.506755
2,3.128928,4.231226
3,-0.210614,-0.471415
4,3.272894,3.219757


In [53]:
# apply random forest
from sklearn.ensemble import RandomForestRegressor

# One forest is fit on both target columns at once (train_target has two
# columns); random_state is fixed so the fit is reproducible.
rf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=42)
rf.fit(train_input, train_target)

# NOTE: this is R^2 on the very data the model was fit on, so it is an
# in-sample score and not an estimate of performance on unseen summaries.
print(rf.score(train_input, train_target))

0.8408261340890211


In [54]:
# Print the feature importances, most important first.
feature_names = train_input.columns

ranked_importances = list(zip(rf.feature_importances_, feature_names))
ranked_importances.sort(reverse=True)
for importance, name in ranked_importances:
    print(f"{name}: {importance}")

summary_length: 0.6749631144900063
bigram_overlap_ratio: 0.16619772998504773
trigram_overlap_ratio: 0.0557462552494045
prompt_length: 0.03552378930680702
splling_err_num: 0.01992874810410929
word_overlap_count: 0.01869494741425063
bigram_overlap_count: 0.014540774200689261
trigram_overlap_count: 0.010246724169548499
quotes_count: 0.004157917080136714


In [55]:
# Predictions on the training inputs themselves (in-sample); each row holds one
# value per target column of train_target.
predictions = rf.predict(train_input)
predictions

array([[-0.16369141,  0.39732165],
       [-0.22749815, -0.14740285],
       [ 2.7363721 ,  2.91301648],
       ...,
       [-0.73922489, -0.35340737],
       [-0.17629303,  0.24444275],
       [ 1.09059234,  0.81154484]])

In [56]:
# The test frame has no target columns, so drop the same id / text columns as
# for training minus the targets. Deriving the list from `drop_columns` keeps
# the train and test feature sets in sync when that list changes.
test_drop_columns = [column for column in drop_columns if column not in targets]

In [57]:
# Feature matrix for the test summaries: ids and raw text columns removed.
test_ = test.drop(columns=test_drop_columns)

In [58]:
# Preview the test feature matrix.
test_.head()

Unnamed: 0,summary_length,splling_err_num,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count
0,3,0,3,0,0,0.0,0,0.0,0
1,3,0,3,0,0,0.0,0,0.0,0
2,3,0,3,0,0,0.0,0,0.0,0
3,3,0,3,0,0,0.0,0,0.0,0


In [59]:
# Check that the test features have the expected columns, dtypes and no nulls.
test_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   summary_length         4 non-null      int64  
 1   splling_err_num        4 non-null      int64  
 2   prompt_length          4 non-null      int64  
 3   word_overlap_count     4 non-null      int64  
 4   bigram_overlap_count   4 non-null      int64  
 5   bigram_overlap_ratio   4 non-null      float64
 6   trigram_overlap_count  4 non-null      int64  
 7   trigram_overlap_ratio  4 non-null      float64
 8   quotes_count           4 non-null      int64  
dtypes: float64(2), int64(7)
memory usage: 416.0 bytes


In [60]:
# Predict both targets for every test summary with the fitted forest.
test_predictions = rf.predict(test_)

In [61]:
# Prediction columns follow the order of the targets the model was fit on.
for target_position, target_name in enumerate(targets):
    test[target_name] = test_predictions[:, target_position]

In [62]:
# Preview the test frame with the predicted scores attached.
test.head()

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,content,wording
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\r\nText...,3,0,0,0.0,0,0.0,0,-1.37831,-1.167186
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\r\nText...,3,0,0,0.0,0,0.0,0,-1.37831,-1.167186
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\r\nText...,3,0,0,0.0,0,0.0,0,-1.37831,-1.167186
3,333333dddddd,def789,Example text 4,3,Example text 4,0,Summarize...,Example Title 2,Heading\r\nText...,3,0,0,0.0,0,0.0,0,-1.37831,-1.167186


In [63]:
# Keep only the id and the two predicted scores for the submission.
test = test.loc[:, ["student_id", "content", "wording"]]
test.head()

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.37831,-1.167186
1,111111eeeeee,-1.37831,-1.167186
2,222222cccccc,-1.37831,-1.167186
3,333333dddddd,-1.37831,-1.167186


In [64]:
# Write the submission file without the row index.
test.to_csv(
    "submission.csv",
    columns=["student_id", "content", "wording"],
    index=False,
)