In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.linalg import norm

Importing the files

In [2]:
df_train_01 = pd.read_csv('./DataFiles/train_01.csv')
df_dev_01 = pd.read_csv('./DataFiles/dev_01.csv')
df_test_01 = pd.read_csv('./DataFiles/test_01.csv')

TFid Calculation for Training Data

In [3]:
tfidf = []

for i in range(0,(len(df_train_01)-1)):
    s1 = str(df_train_01['Sent_1'][i])
    s2 = str(df_train_01['Sent_2'][i])

    # transfer to TF Matrix
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # tfidf
    tfidf.append(np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1])))

# train data for model training
#axis=1, add tfidf and Label 
train_data = pd.concat([df_train_01['Topic_Id'],pd.DataFrame(tfidf),df_train_01['Label']],axis=1)
train_data.columns = ['Topic_Id','distance','Label']

if(train_data.isna().sum().sum()) > 0:
    #drop NaN values
    train_data = train_data.dropna()

# write the train data to csv file
train_data.to_csv("./DataFiles/train_data_alg3.csv",index=0)

TFid Calculation for Dev Data

In [4]:
tfidf_dev = []

for i in range(0,(len(df_dev_01)-1)):
    s1 = str(df_dev_01['Sent_1'][i])
    s2 = str(df_dev_01['Sent_2'][i])
    # transfer to TF Matrix
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # tfidf
    tfidf_dev.append(np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1])))
    #print(tfidf_dev)

# dev data for model training
#axis=1, add jaccard_dis_dev and Label 
dev_data = pd.concat([df_dev_01['Topic_Id'],pd.DataFrame(tfidf_dev),df_dev_01['Label']],axis=1)
dev_data.columns = ['Topic_Id','distance','Label']

if(dev_data.isna().sum().sum()) > 0:
    #drop NaN values
    dev_data = dev_data.dropna()

# write the dev data to csv file
dev_data.to_csv("./DataFiles/dev_data_alg3.csv",index=0)

TFid Calculation for Test Data

In [5]:
tfidf_test = []

for i in range(0,(len(df_test_01)-1)):
    s1 = str(df_test_01['Sent_1'][i])
    s2 = str(df_test_01['Sent_2'][i])
    # transfer to TF Matrix
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # tfidf
    tfidf_test.append(np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1])))
    #print(tfidf_test)

# test data for model training
#axis=1, add jaccard_dis_test and Label 
test_data = pd.concat([df_test_01['Topic_Id'],pd.DataFrame(tfidf_test),df_test_01['Label']],axis=1)
test_data.columns = ['Topic_Id','distance','Label']

if(test_data.isna().sum().sum()) > 0:
    #drop NaN values
    test_data = test_data.dropna()

# write the test data to csv file
test_data.to_csv("./DataFiles/test_data_alg3.csv",index=0)


Training the model with Decision Tree Classifier

In [6]:
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [7]:
# train data_X, train_data_y
train_data_y = train_data['Label']
train_data_X = train_data.copy()
train_data_X.drop(['Label'],axis=1,inplace=True)


# dev_data_X, dev_data_y
dev_data_y = dev_data['Label']
dev_data_X = dev_data.copy()
dev_data_X.drop(['Label'],axis=1,inplace=True)

# test_data_X, test_data_y
test_data_y = test_data['Label']
test_data_X = test_data.copy()
test_data_X.drop(['Label'], axis=1, inplace=True)

In [8]:
clf_tree = tree.DecisionTreeClassifier(criterion="entropy")
clf_tree = clf_tree.fit(train_data_X,train_data_y)
score_tree = clf_tree.score(dev_data_X, dev_data_y)
print("\n score_tree:",score_tree)
print("\n")

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

dev_y_pred = clf_tree.predict(dev_data_X)
dev_y_true = dev_data_y
f1 = f1_score(dev_y_true,dev_y_pred)
print("f1_score:",f1)
print("\n")

print(classification_report(dev_y_true,dev_y_pred))



 score_tree: 0.6831683168316832


f1_score: 0.4810126582278481


              precision    recall  f1-score   support

           0       0.72      0.83      0.77      2671
           1       0.57      0.41      0.48      1470

    accuracy                           0.68      4141
   macro avg       0.65      0.62      0.63      4141
weighted avg       0.67      0.68      0.67      4141



Testing the model

In [9]:
test_y_pred = clf_tree.predict(test_data_X)
test_y_true = test_data_y
f1_test = f1_score(test_y_true,test_y_pred)
print("\n f1_score_test:",f1_test)
print("\n test:")
print(classification_report(test_y_true, test_y_pred))



 f1_score_test: 0.37500000000000006

 test:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       662
           1       0.44      0.33      0.38       175

    accuracy                           0.77       837
   macro avg       0.64      0.61      0.62       837
weighted avg       0.75      0.77      0.76       837



In [10]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(train_data_X, train_data_y)
clf.score(dev_data_X, dev_data_y)

0.7307413668196088

In [11]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()
clf_rf = clf_rf.fit(train_data_X,train_data_y)
score_rf = clf_rf.score(dev_data_X, dev_data_y)