In [1]:
import pandas as pd
import numpy as np

import stanza
from nltk import word_tokenize

In [2]:
%load_ext autoreload

%autoreload 2

In [3]:
import os
import sys
path = os.path.abspath('..')
sys.path.append(path)

from essay_grader.lemmatize import lemmatize
from essay_grader.punctuation import clean_string
from essay_grader.bibliography import remove_all
from essay_grader.vectorize import doc2vec_vectorize
from essay_grader.text_feature import gen_text_feature

In [4]:
df = pd.read_csv("../essay_grader/data/essay_data.csv")

In [5]:
df = remove_all(df)
df = clean_string(df,column='text')

In [6]:
df.head()

Unnamed: 0,text,year,name,title,score,level,title_name
0,the question is asking that in the same discip...,2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss..."
1,our brains seek coherence structure and order ...,2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...
2,in american heritage dictionary of the english...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss..."
3,the statement in the prompt argues that diffic...,2017,"8, 1 - James Li TOK_Essay_4th_draft.docx",1,8,4,It is only knowledge produced with difficulty ...
4,human are patternseeking animals because patte...,2017,"7, 6 - Fzn323_Amy_Wang_Qiaohui_G12_TOK_Essay_D...",6,7,4,Humans are pattern-seeking animals and we are ...


In [7]:
def vocab_richness(text):
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    return unique_word_length/total_length

df['vocab_richness'] = df.text.apply(vocab_richness)

In [8]:
df.groupby("level")['vocab_richness'].mean()

level
1    0.311507
2    0.309585
3    0.309794
4    0.314469
5    0.320593
Name: vocab_richness, dtype: float64

In [9]:
df.groupby("level")['vocab_richness'].std()

level
1    0.035250
2    0.039863
3    0.033980
4    0.034732
5    0.034370
Name: vocab_richness, dtype: float64

In [10]:
def syllables(word):
    syllable_count = 0
    vowels = 'aeiouy'
    if word[0] in vowels:
        syllable_count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            syllable_count += 1
    if word.endswith('e'):
        syllable_count -= 1
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        syllable_count += 1
    if syllable_count == 0:
        syllable_count += 1
    return syllable_count

def mean_word_syllable(text):
    tokens = word_tokenize(text)
    syl = 0
    for token in tokens:
        syl += syllables(token)
    total_length = len(tokens)
    return syl/total_length

In [11]:
df['meanWordSyllable'] = df.text.apply(mean_word_syllable)

df.head()

Unnamed: 0,text,year,name,title,score,level,title_name,vocab_richness,meanWordSyllable
0,the question is asking that in the same discip...,2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss...",0.20936,1.540025
1,our brains seek coherence structure and order ...,2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...,0.344933,1.73138
2,in american heritage dictionary of the english...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss...",0.398082,1.861711
3,the statement in the prompt argues that diffic...,2017,"8, 1 - James Li TOK_Essay_4th_draft.docx",1,8,4,It is only knowledge produced with difficulty ...,0.386667,1.779394
4,human are patternseeking animals because patte...,2017,"7, 6 - Fzn323_Amy_Wang_Qiaohui_G12_TOK_Essay_D...",6,7,4,Humans are pattern-seeking animals and we are ...,0.254939,1.554493


In [12]:
df.groupby("level")['meanWordSyllable'].mean()

level
1    1.663672
2    1.691718
3    1.713382
4    1.737963
5    1.715316
Name: meanWordSyllable, dtype: float64

In [13]:
df.groupby("level")['meanWordSyllable'].std()

level
1    0.096444
2    0.089836
3    0.082089
4    0.089888
5    0.061876
Name: meanWordSyllable, dtype: float64

In [16]:
import spacy 

# Splits the text into sentences, using  
# Spacy's sentence segmentation which can  
# be found at https://spacy.io/usage/spacy-101 
nlp = spacy.load('en') 

def break_sentences(text, nlp): 
    doc = nlp(text) 
    return doc.sents 
  
# Returns Number of Words in the text 
# def word_count(text): 
#     sentences = break_sentences(text,nlp) 
#     words = 0
#     for sentence in sentences: 
#         words += len([token for token in sentence]) 
#     return words 

def word_count(text):
    return len(text.split(" "))


# Returns the number of sentences in the text 
# def sentence_count(text): 
#     sentences = break_sentences(text,nlp) 
#     return len(list(sentences))
def sentence_count(text):
    return len(text.split("."))
  
# Returns average sentence length 
def avg_sentence_length(text): 
    words = word_count(text) 
    sentences = sentence_count(text) 
    average_sentence_length = float(words / sentences) 
    return average_sentence_length

In [17]:
df['word_count'] = df.text.apply(word_count)
df['sentence_count'] = df.text.apply(sentence_count)
df['avg_sentence_length'] = df.text.apply(avg_sentence_length)

In [18]:
df.groupby("level")['word_count'].mean()

level
1    1492.632653
2    1485.168182
3    1531.264151
4    1536.000000
5    1554.800000
Name: word_count, dtype: float64

In [19]:
df.groupby("level")['word_count'].std()

level
1    107.160257
2    125.355582
3     77.387113
4     77.382784
5     50.534147
Name: word_count, dtype: float64

In [20]:
df.groupby("level")['sentence_count'].mean()

level
1    71.673469
2    68.550000
3    67.905660
4    67.515625
5    61.200000
Name: sentence_count, dtype: float64

In [21]:
df.groupby("level")["avg_sentence_length"].mean()

level
1    21.368343
2    22.235258
3    23.055620
4    23.388957
5    26.369376
Name: avg_sentence_length, dtype: float64

In [22]:
df.groupby("level")["avg_sentence_length"].std()

level
1    3.613049
2    4.015905
3    3.390360
4    3.931143
5    5.266054
Name: avg_sentence_length, dtype: float64

### Count stop word

In [23]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def count_stopwords(x):
    word_tokens = word_tokenize(x)
    s = len([w for w in word_tokens if w in stop_words])
    return s

In [24]:
df['count_stopwords'] = df.text.apply(count_stopwords)

In [25]:
df.groupby("level")['count_stopwords'].mean()

level
1    690.510204
2    678.327273
3    691.220126
4    691.421875
5    698.200000
Name: count_stopwords, dtype: float64

In [26]:
df.groupby("level")['count_stopwords'].std()

level
1    65.167452
2    66.164885
3    48.100850
4    58.723405
5    57.590798
Name: count_stopwords, dtype: float64

## check the readability

In [27]:
# pip install textstat

- A higher score in Flesch’s reading ease test indicates material that is easier to read; lower numbers mark passages that are more difficult to read. 

In [28]:
from textstat.textstat import textstatistics, legacy_round 


def flesch_reading_ease(text): 
    """ 
        Implements Flesch Formula: 
        Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW) 
        Here, 
          ASL = average sentence length (number of words  
                divided by number of sentences) 
          ASW = average word length in syllables (number of syllables  
                divided by number of words) 
    """
    FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -\
          float(84.6 * mean_word_syllable(text)) 
    return legacy_round(FRE, 2) 

In [29]:
df['reading_ease'] = df.text.apply(flesch_reading_ease)

In [30]:
df.groupby("level")["reading_ease"].mean()

level
1    44.399184
2    41.146864
3    38.481509
4    36.063594
5    34.952000
Name: reading_ease, dtype: float64

In [31]:
df.groupby("level")["reading_ease"].std()

level
1    9.141040
2    9.460359
3    8.251395
4    9.652404
5    9.682183
Name: reading_ease, dtype: float64

In [32]:
df.head()

Unnamed: 0,text,year,name,title,score,level,title_name,vocab_richness,meanWordSyllable,word_count,sentence_count,avg_sentence_length,count_stopwords,reading_ease
0,the question is asking that in the same discip...,2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss...",0.20936,1.540025,1525,94,16.223404,788,60.08
1,our brains seek coherence structure and order ...,2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...,0.344933,1.73138,1575,58,27.155172,724,32.8
2,in american heritage dictionary of the english...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss...",0.398082,1.861711,1208,41,29.463415,528,19.43
3,the statement in the prompt argues that diffic...,2017,"8, 1 - James Li TOK_Essay_4th_draft.docx",1,8,4,It is only knowledge produced with difficulty ...,0.386667,1.779394,1594,60,26.566667,732,29.33
4,human are patternseeking animals because patte...,2017,"7, 6 - Fzn323_Amy_Wang_Qiaohui_G12_TOK_Essay_D...",6,7,4,Humans are pattern-seeking animals and we are ...,0.254939,1.554493,1500,67,22.38806,703,52.6


In [None]:
# df.to_pickle('../essay_grader/pickle_data/df.pkl')

In [4]:
import os
import sys
path = os.path.abspath('..')
sys.path.append(path)

from essay_grader.lemmatize import lemmatize

In [2]:
import pickle
with open("../essay_grader/pickle_data/df.pkl", "rb") as file:
        df = pickle.load(file)

In [5]:
df = lemmatize(df,"text")

2020-08-21 09:36:57 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-08-21 09:36:57 INFO: Use device: cpu
2020-08-21 09:36:57 INFO: Loading: tokenize
2020-08-21 09:36:57 INFO: Loading: pos
2020-08-21 09:36:58 INFO: Loading: lemma
2020-08-21 09:36:58 INFO: Done loading processors!


In [6]:
df.head(3)

Unnamed: 0,text,year,name,title,score,level,title_name,vocab_richness,meanWordSyllable,word_count,sentence_count,avg_sentence_length,count_stopwords,reading_ease,lemmatized_text
0,the question is asking that in the same discip...,2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss...",0.20936,1.540025,1525,94,16.223404,788,60.08,"[[the, question, be, ask, that, in, the, same,..."
1,our brains seek coherence structure and order ...,2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...,0.344933,1.73138,1575,58,27.155172,724,32.8,"[[we, brain, seek, coherence, structure, and, ..."
2,in american heritage dictionary of the english...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss...",0.398082,1.861711,1208,41,29.463415,528,19.43,"[[in, american, heritage, dictionary, of, the,..."


In [7]:
# df.to_pickle('../essay_grader/pickle_data/lemma_df.pkl')

In [22]:
import os
import sys
path = os.path.abspath('..')
sys.path.append(path)

from essay_grader.text_feature import gen_text_feature

In [23]:
df.sample()

Unnamed: 0,text,year,name,title,score,level,title_name
30,The claim suggests that our degree of apprecia...,2017,"4, 1 - FZN289_Nick_Zhang_Zexun_G12-8_TOKEssay_...",1,4,2,It is only knowledge produced with difficulty ...


In [24]:
df1 = gen_text_feature(df)
df1.head()

Unnamed: 0,text,year,name,title,score,level,title_name,vocab_richness,mean_word_syllable,word_count,sentence_count,avg_sentence_length,count_stopwords,flesch_reading_ease
0,the question is asking that in the same discip...,2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss...",0.20936,1.540025,1525,94,16.223404,788,60.08
1,our brains seek coherence structure and order ...,2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...,0.344933,1.73138,1575,58,27.155172,724,32.8
2,in american heritage dictionary of the english...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss...",0.398082,1.861711,1208,41,29.463415,528,19.43
3,the statement in the prompt argues that diffic...,2017,"8, 1 - James Li TOK_Essay_4th_draft.docx",1,8,4,It is only knowledge produced with difficulty ...,0.386667,1.779394,1594,60,26.566667,732,29.33
4,human are patternseeking animals because patte...,2017,"7, 6 - Fzn323_Amy_Wang_Qiaohui_G12_TOK_Essay_D...",6,7,4,Humans are pattern-seeking animals and we are ...,0.254939,1.554493,1500,67,22.38806,703,52.6


## Play with gensim word2vec

In [28]:
import gensim

In [100]:
from gensim.models import Word2Vec

# Load pretrained model 
model = gensim.models.KeyedVectors.load_word2vec_format(path +\
    '/essay_grader/data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [56]:
vocab_obj = model.vocab["word"]
vocab_obj.count

2998437

In [57]:
vocab_obj = model.vocab["knowledge"]
vocab_obj.count

2998091

In [58]:
vocab_obj = model.vocab["Obama"]
vocab_obj.count

2999506

In [50]:
import pickle
with open("../essay_grader/pickle_data/lemma_df.pkl", "rb") as file:
        df = pickle.load(file)

In [52]:
df.head()

Unnamed: 0,text,year,name,title,score,level,title_name,vocab_richness,meanWordSyllable,word_count,sentence_count,avg_sentence_length,count_stopwords,reading_ease,lemmatized_text
0,the question is asking that in the same discip...,2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss...",0.20936,1.540025,1525,94,16.223404,788,60.08,"[[the, question, be, ask, that, in, the, same,..."
1,our brains seek coherence structure and order ...,2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...,0.344933,1.73138,1575,58,27.155172,724,32.8,"[[we, brain, seek, coherence, structure, and, ..."
2,in american heritage dictionary of the english...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss...",0.398082,1.861711,1208,41,29.463415,528,19.43,"[[in, american, heritage, dictionary, of, the,..."
3,the statement in the prompt argues that diffic...,2017,"8, 1 - James Li TOK_Essay_4th_draft.docx",1,8,4,It is only knowledge produced with difficulty ...,0.386667,1.779394,1594,60,26.566667,732,29.33,"[[the, statement, in, the, prompt, argue, that..."
4,human are patternseeking animals because patte...,2017,"7, 6 - Fzn323_Amy_Wang_Qiaohui_G12_TOK_Essay_D...",6,7,4,Humans are pattern-seeking animals and we are ...,0.254939,1.554493,1500,67,22.38806,703,52.6,"[[human, be, patternseek, animal, because, pat..."


In [70]:
def word_commonness(lemmatized_list):
    sum_count = 0
    length_count = 0
    for i in lemmatized_list:
        for j in i:
            try:
                vocab_obj = model.vocab[j]
                sum_count += vocab_obj.count
                length_count += 1
            except KeyError as e:
                pass
    return sum_count/length_count      

In [71]:
df["word_commonness"] = df.lemmatized_text.apply(word_commonness)

In [72]:
df.head(3)

Unnamed: 0,text,year,name,title,score,level,title_name,vocab_richness,meanWordSyllable,word_count,sentence_count,avg_sentence_length,count_stopwords,reading_ease,lemmatized_text,word_commonness
0,the question is asking that in the same discip...,2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss...",0.20936,1.540025,1525,94,16.223404,788,60.08,"[[the, question, be, ask, that, in, the, same,...",2992275.0
1,our brains seek coherence structure and order ...,2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...,0.344933,1.73138,1575,58,27.155172,724,32.8,"[[we, brain, seek, coherence, structure, and, ...",2988191.0
2,in american heritage dictionary of the english...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss...",0.398082,1.861711,1208,41,29.463415,528,19.43,"[[in, american, heritage, dictionary, of, the,...",2989039.0


In [75]:
df.groupby("level")["word_commonness"].mean()/300000

level
1    9.960910
2    9.958811
3    9.956124
4    9.955638
5    9.945923
Name: word_commonness, dtype: float64

In [97]:
df.groupby("level")["word_commonness"].agg(["mean","std"])

Unnamed: 0_level_0,mean,std
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2988273.0,6681.194699
2,2987643.0,6337.604019
3,2986837.0,6189.552744
4,2986691.0,6458.871722
5,2983777.0,7951.331454


In [79]:
text0 = df.text[0]

In [86]:
text0[:200]

'the question is asking that in the same discipline why experts often have different opinions on the same fact for example a scientist disagrees with other scientists discovers and results facts are th'

In [85]:
text0 = text0.replace(".","")

In [88]:
# text0.split(" ")

In [89]:
def word_commonness(text):
    sum_count = 0
    length_count = 0
    text = text.replace(".","")
    text_list = text.split(" ")
    for i in text_list:
        try:
            vocab_obj = model.vocab[i]
            sum_count += vocab_obj.count
            length_count += 1
        except KeyError as e:
            pass
    return sum_count/length_count  

In [90]:
df["word_commonness_2"] = df.text.apply(word_commonness)

In [95]:
df.groupby("level")["word_commonness_2"].mean()/3000000

level
1    0.995750
2    0.995544
3    0.995182
4    0.995156
5    0.991894
Name: word_commonness_2, dtype: float64

In [96]:
df.groupby("level")["word_commonness_2"].std()/3000000

level
1    0.002130
2    0.002242
3    0.002630
4    0.002440
5    0.005383
Name: word_commonness_2, dtype: float64

## Test text_feature file

In [6]:
import pickle
with open("../essay_grader/pickle_data/lemma_df.pkl", "rb") as file:
        lemma_df = pickle.load(file)

In [8]:
lemma_df.lemmatized_text[0]

[['the',
  'question',
  'be',
  'ask',
  'that',
  'in',
  'the',
  'same',
  'discipline',
  'why',
  'expert',
  'often',
  'have',
  'different',
  'opinion',
  'on',
  'the',
  'same',
  'fact',
  '.'],
 ['for',
  'example',
  'a',
  'scientist',
  'disagree',
  'with',
  'other',
  'scientist',
  'discover',
  'and',
  'result',
  '.'],
 ['fact',
  'be',
  'the',
  'phenomena',
  'that',
  'can',
  'be',
  'commonly',
  'observe',
  'by',
  'we',
  'and',
  'can',
  'not',
  'be',
  'change',
  'by',
  'we',
  '.'],
 ['they',
  'be',
  'the',
  'evidence',
  'and',
  'the',
  'basic',
  'knowledge',
  'that',
  'we',
  'can',
  'easily',
  'get',
  '.'],
 ['for', 'example', 'the', 'earth', 'be', 'a', 'sphere', '.'],
 ['this',
  'be',
  'a',
  'fact',
  'that',
  'we',
  'already',
  'prove',
  'and',
  'observe',
  'from',
  'space',
  '.'],
 ['the',
  'evidence',
  'be',
  'sufficient',
  'to',
  'prove',
  'the',
  'fact',
  'by',
  'use',
  'reasoning',
  'as',
  'a',
  'way',

In [28]:
def remove_p(list_of_list):
    for i in list_of_list:
        try:
            i.remove(".")
        except ValueError:
            pass
    return list_of_list

In [31]:
# lemma_df.lemmatized_text.apply(remove_p)

In [32]:
df = pd.read_csv("../essay_grader/data/essay_data.csv")

In [33]:
df.head()

Unnamed: 0,text,year,name,title,score,level,title_name
0,"The question is asking that, in the same disci...",2017,"4, 5 - Est_Chen-fzn235-TOK_essay.docx",5,4,2,"Given access to the same facts, how is it poss..."
1,"Our brains seek coherence, structure, and orde...",2017,"7, 6 - Eva GuoTOK_final_final_draft.docx",6,7,4,Humans are pattern-seeking animals and we are ...
2,In American Heritage® Dictionary of the Englis...,2017,"7, 5 - fzn260_Yessica_Ji_Yuanyi_G12-9_TOKEssay...",5,7,4,"Given access to the same facts, how is it poss..."
3,The statement in the prompt argues that diffic...,2017,"8, 1 - James Li TOK_Essay_4th_draft.docx",1,8,4,It is only knowledge produced with difficulty ...
4,Human are pattern-seeking animals because patt...,2017,"7, 6 - Fzn323_Amy_Wang_Qiaohui_G12_TOK_Essay_D...",6,7,4,Humans are pattern-seeking animals and we are ...


In [34]:
gen_df = gen_text_feature(df)

In [36]:
gen_df.groupby("level").mean()

Unnamed: 0_level_0,year,title,score,vocab_richness,mean_word_syllable,word_count,sentence_count,avg_sentence_length,count_stopwords,flesch_reading_ease,word_commonness
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2018.428571,3.306122,1.836735,0.311507,1.663672,1492.632653,71.673469,21.368343,690.510204,44.399184,9.957501
2,2018.681818,3.472727,3.581818,0.309585,1.691718,1485.168182,68.55,22.235258,678.327273,41.146864,9.955442
3,2018.698113,3.578616,5.402516,0.309794,1.713382,1531.264151,67.90566,23.05562,691.220126,38.481509,9.951818
4,2018.796875,3.5625,7.34375,0.314469,1.737963,1536.0,67.515625,23.388957,691.421875,36.063594,9.951556
5,2017.6,4.4,9.0,0.320593,1.715316,1554.8,61.2,26.369376,698.2,34.952,9.918938


In [37]:
df.to_pickle('../essay_grader/pickle_data/text_feature_df.pkl')