In [3]:
import pandas as pd
import string 

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB



In [5]:
df = pd.read_csv('../essay_grader/data/essay_data.csv')
#convert the 5 level 5 essays to a level 4
df['level'] = df['level'].apply(lambda x: x if x < 4 else 4)
df['level'].value_counts()

2    220
3    159
4     69
1     49
Name: level, dtype: int64

In [24]:
df['text'][0].count('\n')

52

### Simple pre-processing to use for Doc2Vec model

In [3]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text
def remove_punctuation(review):
    for punctuation in string.punctuation:
        review = review.replace(punctuation, ' ')
    return review

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [4]:

df['text'] = df['text'].str.lower().apply(lambda x: remove_numbers(x)) #remove numbers
df['text'] = df['text'].apply(remove_punctuation) #removes ENGLISH punctuations
df['text'] = df['text'].str.replace('\n', ' ') #remove new line characters
df['text'] = df['text'].apply(lambda x: word_tokenize(x)) 
# df['text'] = df['text'].apply(lambda x:[w for w in x if w not in stop_words]) #can try to remove stop words if needed as well.
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


#can try to use a stemmer here to see how it improve the model later.


### Split data into training and testing set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['level']), df['level'], random_state = 0, test_size = 0.2)

### Vectorize the data

#### Doc2Vec

Functions to create the vector and vectorize the text

In [7]:
def d2v_create_vectors(text, dm, vector_size, negative, hs, min_count , sample , window):
    """
    creates vectors from text files using the doc2vec model.
    This is called by the doc2vec_vectorize function and returns a model as well as 
    documents which include the documents and the document tags.
    Paramaters for the function are:
    dm ({0,1}, optional) 1: PV-DM, 0: PV-DBOW 
    vector size, negative, hs, min_count, sample
    """
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(text)]
    cores = multiprocessing.cpu_count()
    model = Doc2Vec(window = window, dm = dm, vector_size = vector_size, negative = negative, hs = hs, min_count = min_count, sample = sample, works = cores)
    model.build_vocab(documents)
    model.train(documents, total_examples = len(documents), epochs = 30)
    return model, documents

def doc2vec_vectorize(df, col, dm =0, vector_size =300,  negative = 5, hs = 0, min_count = 2, sample = 0, window = 7):
    """function to vectorize the text using the doc2vec method."""
    df = df.copy()
    df.reset_index(inplace = True)
    vecs = []
    model, documents = d2v_create_vectors(df[col], dm, vector_size, negative, hs, min_count , sample , window)
    for doc_id in range(len(documents)):
        vecs.append(model.docvecs[doc_id])
    new_col = 'd2v_' + col
    df[new_col] = pd.Series(vecs)
    return df

##### Test to make sure the vecotrization worked
- This checks to see which document each vector is most similar to.  The counter should show that almost all documents are most similar to themselves.

- The print lines print the most simial document (it self usually) the second most, the median, and the least similar.

In [41]:
ranks = []
second_ranks = []
for doc_id in range(len(documents)):
    inferred_vector = model.infer_vector(documents[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [42]:
import collections
counter = collections.Counter(ranks)
print(counter)

Counter({0: 397})


In [46]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(documents[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(documents[sims[index][0]].words)))

#### Tf-IDF with n-grams

In [270]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer().fit(df['text'])

### Train and Test model

Vectorize both the train and test sets

In [9]:
X_train['text'][0]

'The question is asking that, in the same discipline, why experts often have different opinions on the same fact. For example, a scientist disagrees with other scientists’ discovers and results. Facts are the phenomena that can be commonly observed by us and can not be changed by us. They are the evidence and the basic knowledge that we can easily get. For example, the earth is a sphere. This is a fact that we already proved and observed from space. The evidence is sufficient to prove the fact by using reasoning as a way of knowing, and we now treat this fact as knowledge.\n\nThat\'s how we gain knowledge from facts. However, facts are not the same in all disciplines. Based on the different ways of knowing used in different area of knowledge, facts are evaluated differently. Arguments are mainly caused by the different opinions or knowledge between two parties. Two experts may have disagreements on the understanding of the same phenomenon. Experts in the same discipline consist of diff

In [82]:
%time
#can choose doc2vec, Tf-Idf, or word2vec
X_train_vecs = doc2vec_vectorize(X_train, 'text')
X_test_vecs = doc2vec_vectorize(X_test, 'text')

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


Cross-validate one of the below models.

In [75]:
log_model = LogisticRegression()
svc_model = SVC(kernel = 'sigmoid')
random_for_model = RandomForestClassifier()
nb_log = MultinomialNB() #can try to Tf-Idf but not doc2vec


cv = cross_validate(svc_model, X_train_vecs['d2v_text'].to_list(), y_train, cv = 5, n_jobs = -1, scoring = "accuracy", verbose = 1, return_estimator=True)
cv

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


{'fit_time': array([0.05619621, 0.05933285, 0.0507772 , 0.05084705, 0.04935598]),
 'score_time': array([0.0096221 , 0.01016116, 0.00927591, 0.00996494, 0.00936723]),
 'estimator': (SVC(kernel='sigmoid'),
  SVC(kernel='sigmoid'),
  SVC(kernel='sigmoid'),
  SVC(kernel='sigmoid'),
  SVC(kernel='sigmoid')),
 'test_score': array([0.4       , 0.4625    , 0.40506329, 0.49367089, 0.35443038])}

In [81]:
estimator = cv['estimator'][4]
y_pred = estimator.predict(X_test_vecs['d2v_text'].to_list())
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.50      0.79      0.61        48
           3       0.25      0.22      0.24        27
           4       0.00      0.00      0.00        12

    accuracy                           0.44       100
   macro avg       0.19      0.25      0.21       100
weighted avg       0.31      0.44      0.36       100



  _warn_prf(average, modifier, msg_start, len(result))


In [306]:
estimator = cv['estimator'][1]
y_pred = estimator.predict(X_test_vecs)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.14      0.08      0.10        13
           2       0.48      0.52      0.50        48
           3       0.21      0.22      0.21        27
           4       0.25      0.25      0.25        12

    accuracy                           0.35       100
   macro avg       0.27      0.27      0.27       100
weighted avg       0.34      0.35      0.34       100

