In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from helpers.ExtractFeatures import ExtractFeatures,ReadFeatureFiles

In [2]:
class SpecificityModel:
    """Logistic-regression classifier trained on pre-extracted sentence features.

    Usage, as in the cells that follow: construct the object, call
    ``read_features()``, call ``train()`` for one sentence type, then use
    ``predict()`` or ``cross_val_scores()``.
    """

    # sent_type code -> (feature-frame attribute, word-feature attribute) on the
    # ReadFeatureFiles object, plus the sub-folder of ``models/`` for the pickle.
    _SENT_TYPES = {
        'i': ('df_i', 'wf_i', 'instantiation'),
        's': ('df_s', 'wf_s', 'specification'),
    }

    def __init__(self):
        """Read the PDTB and patent CSV files from their fixed relative paths."""
        self.df_pdtb = pd.read_csv('pdtb2.csv',low_memory=False)
        self.df_patent = pd.read_csv('bigPatentData_csv/train.csv')

    def extract_features(self):
        """Run ``ExtractFeatures.extract_features`` for both 'i' and 's'."""
        fe = ExtractFeatures(self.df_pdtb,self.df_patent)
        fe.extract_features('i')
        fe.extract_features('s')

    def read_features(self):
        """Create a ``ReadFeatureFiles`` reader, store it and load its features."""
        self.obj_read_feats = ReadFeatureFiles()
        self.obj_read_feats.read_features()

    def train(self,sent_type='i',split_size=0.2):
        """Fit the classifier for one sentence type and pickle it to disk.

        Args:
            sent_type: 'i' (instantiation) or 's' (specification).
            split_size: fraction of the samples held out as the test split.

        Raises:
            ValueError: if ``sent_type`` is not 'i' or 's'.

        Side effects: sets ``in_feats``, ``y_true``, the train/test split
        attributes and ``clf`` on the instance, and writes
        ``models/<folder>/specificity_model.pickle``.
        Requires ``read_features()`` to have been called first.
        """
        # Validate up front so an unknown code can neither reuse stale features
        # from an earlier call nor fail later on an unbound folder name.
        if sent_type not in self._SENT_TYPES:
            raise ValueError(
                "sent_type must be one of %s, got %r"
                % (sorted(self._SENT_TYPES), sent_type)
            )
        df_attr, wf_attr, str_folder_name = self._SENT_TYPES[sent_type]
        df_feats = getattr(self.obj_read_feats, df_attr)
        word_feats = getattr(self.obj_read_feats, wf_attr)

        # Feature matrix = every non-label column followed by the word features.
        df_wo_labels = df_feats[df_feats.columns.drop('labels')]
        self.in_feats = np.concatenate((df_wo_labels.values, word_feats), axis=1)
        self.y_true = df_feats.labels.values

        # Fixed random_state keeps the split reproducible between runs.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.in_feats, self.y_true,
            test_size=split_size, random_state=5, shuffle=True,
        )
        self.clf = LogisticRegression(solver='liblinear')
        self.clf.fit(np.asarray(self.X_train), np.asarray(self.y_train))

        # Create the target folder if needed so saving cannot fail on a fresh checkout.
        model_dir = 'models/' + str_folder_name
        os.makedirs(model_dir, exist_ok=True)
        with open(model_dir + '/' + 'specificity_model.pickle', 'wb') as handle:
            pickle.dump(self.clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def predict(self):
        """Predict on the held-out test split created by ``train()``.

        Returns:
            1-D integer array holding, per test row, the index of the first
            probability column whose value exceeds 0.5 (0 if none does).
            NOTE(review): these are column indices of ``predict_proba``, which
            match the class labels only if the labels are 0/1 - confirm.
        """
        y_pred = self.clf.predict_proba(np.asarray(self.X_test))
        y_pred_int = np.argmax(y_pred > 0.5, axis=1)
        return y_pred_int

    def cross_val_scores(self,folds=10):
        """Return per-fold accuracy of a fresh classifier on the full feature set.

        Args:
            folds: number of shuffled K-fold splits.

        Raises:
            RuntimeError: if ``train()`` has not populated the features yet.
        """
        if not hasattr(self, 'in_feats') or not hasattr(self, 'y_true'):
            raise RuntimeError("call train() before cross_val_scores()")
        clf = LogisticRegression(solver='liblinear')
        cv = KFold(n_splits=folds, random_state=1, shuffle=True)
        # Use the features/labels stored by train(); there are no such globals.
        scores = cross_val_score(clf, self.in_feats, self.y_true, scoring='accuracy', cv=cv, n_jobs=-1)
        return scores
    

In [3]:
# Constructing the model reads 'pdtb2.csv' and 'bigPatentData_csv/train.csv' (see SpecificityModel.__init__).
a = SpecificityModel()

In [4]:
# Feature extraction is left disabled; read_features() is called on its own instead
# (presumably loading the feature pickles listed in the output below - confirm).
#a.extract_features()
a.read_features()

Instantiation Features found!
features/instantiation/necd_features.pickle
features/instantiation/polarity_features.pickle
features/instantiation/sentence_length_features.pickle
features/instantiation/specificity_features.pickle
features/instantiation/syntactic_features.pickle
features/instantiation/lm_features.pickle
Specification Features found!
features/specification/necd_features.pickle
features/specification/polarity_features.pickle
features/specification/sentence_length_features.pickle
features/specification/specificity_features.pickle
features/specification/syntactic_features.pickle
features/specification/lm_features.pickle


In [7]:
import pickle

In [8]:
# Train on the 'i' (instantiation) features; this also writes models/instantiation/specificity_model.pickle.
a.train('i')

In [None]:
# Keep the predictions under the name the metric cells below expect, then display them.
y_pred_int = a.predict()
y_pred_int

In [None]:
# Shape of the feature matrix that a.train() stored on the model.
a.in_feats.shape

In [None]:

#y_true_s = a.df_s.labels.values
#y_true = np.concatenate((y_true_i,y_true_s[:1]),axis = 0)
# Labels for the sentence type last passed to a.train(), stored there as a.y_true.
a.y_true

In [None]:
# Split the features and labels that a.train() stored on the model,
# with the same test_size and random_state that train() uses.
X_train, X_test, y_train, y_test = train_test_split(a.in_feats, a.y_true, test_size=0.2, random_state=5,shuffle=True)

In [None]:
# Shape of the training split.
X_train.shape

In [None]:
# Standard scaling is left disabled here (StandardScaler is not imported in any visible cell).
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(np.asarray(X_train))
clf = LogisticRegression(solver='liblinear')

# Fit on the unscaled training split; the fitted estimator is the cell's output.
clf.fit(np.asarray(X_train), np.asarray(y_train))

In [None]:
#X_test_scaled = scaler.transform(np.asarray(X_test))


In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score

In [None]:
# NOTE(review): y_pred_int is not assigned in any visible cell - presumably the array returned by a.predict(); confirm.
# For binary labels, ravel() on the confusion matrix yields the counts in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_int).ravel()

In [None]:
# Display the confusion-matrix counts, reordered as (tp, fn, fp, tn).
(tp, fn, fp, tn)

In [None]:
# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_test, y_pred_int)

In [None]:
# Recall of the positive class (scikit-learn's default pos_label is 1).
recall_score(y_test, y_pred_int)

In [None]:
# Precision of the positive class (scikit-learn's default pos_label is 1).
precision_score(y_test, y_pred_int)

In [None]:
# Shape of the held-out test split.
X_test.shape

In [None]:
# Arithmetic mean of the values in `scores`.
# NOTE(review): `scores` is not assigned in any visible cell - presumably the result of a.cross_val_scores(); confirm.
sum(scores)/len(scores)

In [None]:
# Display the individual values (presumably one accuracy per cross-validation fold - confirm).
scores

In [None]:
'''
Scores:
1. All features, no standard scaling
Accuracy:
Precision:
Recall:

2. All features with standard scaling
Accuracy: 99.35
Precision: 99.62
Recall: 99.07

3. Word features only, with standard scaling
Same as 2.
'''

In [1]:
import pandas as pd

In [2]:
# Load the PDTB annotations; low_memory=False parses the file in one pass, avoiding mixed-type column inference.
df_pdtb = pd.read_csv('pdtb2.csv',low_memory=False)

In [3]:
# Keep only the rows whose 'Relation' column is 'Implicit'.
implicit_rel = df_pdtb[df_pdtb['Relation']=='Implicit']

In [None]:
# Preview the implicit relations with the columns used by the sense filters in the following cells.
# (Replaces an unfinished filter expression that did not parse.)
implicit_rel[['Relation','ConnHeadSemClass1','ConnHeadSemClass2','FullRawText']]

In [8]:
# Implicit relations whose first or second sense class is Expansion.Instantiation.
a = implicit_rel[
    implicit_rel[['ConnHeadSemClass1', 'ConnHeadSemClass2']]
    .eq('Expansion.Instantiation')
    .any(axis=1)
]
a[['Relation','ConnHeadSemClass1','ConnHeadSemClass2','FullRawText']]

Unnamed: 0,Relation,ConnHeadSemClass1,ConnHeadSemClass2,FullRawText
29,Implicit,Expansion.Instantiation,,"Despite recent declines in yields, investors c..."
31,Implicit,Expansion.Instantiation,,"Typically, money-fund yields beat comparable s..."
32,Implicit,Expansion.Instantiation,,The top money funds are currently yielding wel...
187,Implicit,Expansion.Instantiation,,There were many pioneer PC contributors. Willi...
245,Implicit,Expansion.Instantiation,,But some European funds recently have skyrocke...
...,...,...,...,...
40461,Implicit,Expansion.Instantiation,,"Analysts, who were expecting Alcoa to post aro..."
40487,Implicit,Expansion.Instantiation,,Such proclamations leave network officials all...
40493,Implicit,Expansion.Instantiation,,But recent developments have made the networks...
40548,Implicit,Expansion.Instantiation,,Intel's business is strong. Our bookings impro...


In [12]:
# Implicit relations whose first or second sense class is Expansion.Restatement.Specification.
b = implicit_rel[
    implicit_rel[['ConnHeadSemClass1', 'ConnHeadSemClass2']]
    .eq('Expansion.Restatement.Specification')
    .any(axis=1)
]
b[['Relation','ConnHeadSemClass1','ConnHeadSemClass2','FullRawText']]

Unnamed: 0,Relation,ConnHeadSemClass1,ConnHeadSemClass2,FullRawText
4,Implicit,Expansion.Restatement.Specification,,This is an old story. We're talking about year...
14,Implicit,Expansion.Restatement.Specification,,About 160 workers at a factory that made paper...
64,Implicit,Expansion.Restatement.Specification,,"South Korea's economic boom, which began in 19..."
67,Implicit,Expansion.Restatement.Specification,,Newsweek's ad rates would increase 5% in Janua...
73,Implicit,Expansion.Restatement.Specification,,"it will introduce the Circulation Credit Plan,..."
...,...,...,...,...
40424,Implicit,Expansion.Restatement.Specification,,"By diversifying supply sources, the toy makers..."
40425,Implicit,Expansion.Restatement.Specification,,It wouldn't be easy to duplicate quickly the m...
40456,Implicit,Expansion.Restatement.Specification,,The administration urged the justices to adopt...
40551,Implicit,Expansion.Restatement.Specification,,Our bookings improved as the quarter progresse...


Unnamed: 0,Relation,ConnHeadSemClass1,ConnHeadSemClass2,FullRawText
4,Implicit,Expansion.Restatement.Specification,,This is an old story. We're talking about year...
5,Implicit,Expansion.Conjunction,Comparison,We're talking about years ago before anyone he...
6,Implicit,Contingency.Cause.Result,,Neither Lorillard nor the researchers who stud...
9,Implicit,Expansion.Conjunction,,Among 33 men who worked closely with the subst...
10,Implicit,Expansion.Conjunction,,Among 33 men who worked closely with the subst...
...,...,...,...,...
40589,Implicit,Contingency.Cause.Reason,,"if it does, Pretoria will use this as a reason..."
40591,Implicit,Comparison.Contrast,,"In addition, the government is figuring that t..."
40593,Implicit,Expansion.Restatement.Specification,,The men also will be faced with bridging the g...
40594,Implicit,Contingency.Cause.Result,,They never considered themselves to be anythin...
