-
Notifications
You must be signed in to change notification settings - Fork 0
/
CRF.py
106 lines (89 loc) · 3.86 KB
/
CRF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pandas as pd
import time
import pickle
import re
from NER.generate_features import Features
from NER.tags import tagger
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report,flat_accuracy_score
import spacy
import nltk
import editdistance
class ConditionalRandomField():
    """Train, persist and apply a linear-chain CRF sequence tagger.

    The class wires together three collaborators imported by this module:
    ``tagger`` (token tagging), ``Features`` (feature extraction) and
    ``sklearn_crfsuite.CRF`` (the model itself).  The fitted model is pickled
    to ``model_path`` by :meth:`train1` and read back by :meth:`predict`.
    """

    # Default location of the pickled model shared by train1() and predict().
    DEFAULT_MODEL_PATH = 'finalized_model.sav'

    def __init__(self, num_features, model_path=DEFAULT_MODEL_PATH):
        """Store the feature-extraction setting and the model file location.

        Args:
            num_features: Forwarded to ``Features`` (as ``num_words`` in
                :meth:`predict`, positionally in :meth:`train1`).
            model_path: File the trained model is pickled to / loaded from.
                Defaults to ``'finalized_model.sav'``.
        """
        self.num_features = num_features
        self.model_path = model_path

    def train(self, data, y, tag):
        """Remember the training inputs and return ``data`` run through the tagger.

        Args:
            data: Raw input handed to ``tagger``.
            y: Labels; stored on the instance, not otherwise used here.
            tag: Tag name(s); stored on the instance, not otherwise used here.

        Returns:
            Whatever ``tagger(data).tag()`` produces.
        """
        self.data = data
        self.y = y
        self.tag = tag
        # Tagging each word in data to its corresponding tag
        return tagger(data).tag()

    def train1(self, data, y, tag):
        """Fit a CRF on ``data``, pickle it to ``self.model_path`` and report.

        Prints the model, a classification report and the accuracy measured
        on the training data itself.

        Args:
            data: Input passed straight to ``Features``.
                NOTE(review): presumably already tagged/labelled data (e.g.
                the output of ``tagger.fit``) -- confirm against the caller.
            y: Unused; kept so existing callers keep working.
            tag: Unused; kept so existing callers keep working.
        """
        # Features as conditional random field accepts.
        # NOTE(review): ``get`` is accessed without a call at every use site in
        # this class, so it is presumably a property of Features -- confirm.
        x_train, y_train = Features(data, self.num_features).get
        print("labelled data")

        # L-BFGS training; c1 / c2 are the regularisation coefficients.
        crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100,
                  all_possible_transitions=False)
        print(crf)
        crf.fit(x_train, y_train)

        # Saving the model which is trained; the context manager guarantees
        # the file is flushed and closed even if pickling fails.
        with open(self.model_path, 'wb') as model_file:
            pickle.dump(crf, model_file)

        # Prediction on train, then classification report and accuracy
        pred = crf.predict(x_train)
        print('\n \n Prediction On Trained Data:\n \n', flat_classification_report(y_train, pred))
        print('Accuracy:', flat_accuracy_score(y_train, pred))

    def _load_model(self):
        """Return the model previously pickled to ``self.model_path``."""
        # SECURITY: pickle executes arbitrary code while loading; only point
        # model_path at files this application wrote itself.
        with open(self.model_path, 'rb') as model_file:
            return pickle.load(model_file)

    def predict(self, data, y=None, tag=None):
        """Predict tag sequences for ``data`` with the saved model.

        When both ``y`` and ``tag`` are supplied the data is labelled via
        ``tagger.fit`` first, and a classification report plus accuracy are
        printed for the predictions.  When both are omitted the data is only
        tagged and predicted.

        Args:
            data: Raw input handed to ``tagger``.
            y: Optional gold labels.
            tag: Optional tag name(s) accompanying ``y``.

        Returns:
            The value of ``loaded_model.predict(x_test)``.

        Raises:
            ValueError: If exactly one of ``y`` / ``tag`` is provided.
        """
        # Identity checks: ``y != None`` would compare element-wise (and then
        # fail in a boolean context) for array-like labels.
        labelled = y is not None
        if labelled != (tag is not None):
            raise ValueError("y and tag must be provided together or both omitted")

        # Tagging each word in data to its corresponding tag.  An earlier
        # comment describes the tagged items as tuples of
        # (token, pos tag, lemmatized word, other tag).
        t = tagger(X=data)
        if labelled:
            tagged_data_ = t.fit(X=t.tag(), y=y, tag=tag)
        else:
            tagged_data_ = t.tag()

        # Generates features required for conditional random field
        x_test, y_test = Features(X=tagged_data_, num_words=self.num_features).get

        # Gets trained model from the pickle file and predicts on test data
        result = self._load_model().predict(x_test)

        if labelled:
            # printing classification report and Accuracy
            print('\n\n Classification Report: \n', flat_classification_report(y_test, result))
            print('Accuracy:', flat_accuracy_score(y_test, result))

        # NOTE: a previous revision (removed commented-out code) tokenised
        # ``data`` with the regex '[A-Za-z0-9]+' and joined the tokens predicted
        # as 'P' / 'I' into "Products" / "Issues" columns, using "not assigned"
        # when none matched.  The raw predictions are returned instead.
        return result