In [34]:
import re
import numpy as np
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Load the training split (tab-separated). The path uses forward slashes so the
# relative location resolves on Windows as well as on POSIX systems.
# NOTE(review): error_bad_lines=False together with warn_bad_lines=False drops
# malformed rows without any message — confirm the resulting row count is the
# expected one. Both flags are deprecated in newer pandas releases in favour of
# `on_bad_lines`; switch when the pinned pandas version allows it.
train_df = pd.read_csv("data/train.tsv", sep="\t", error_bad_lines=False, warn_bad_lines=False)

In [36]:
train_df.shape

(3549, 5)

In [37]:
train_df.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr..."
1,0,2108705,2108831,Yucaipa owned Dominick 's before selling the c...,Yucaipa bought Dominick 's in 1995 for $ 693 m...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ..."
3,0,3344667,3344648,"Around 0335 GMT , Tab shares were up 19 cents ...","Tab shares jumped 20 cents , or 4.6 % , to set..."
4,1,1236820,1236712,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...


In [38]:
# Load the test split (tab-separated). The path uses forward slashes so the
# relative location resolves on Windows as well as on POSIX systems.
# NOTE(review): error_bad_lines=False together with warn_bad_lines=False drops
# malformed rows without any message — confirm the resulting row count is the
# expected one. Both flags are deprecated in newer pandas releases in favour of
# `on_bad_lines`; switch when the pinned pandas version allows it.
test_df = pd.read_csv("data/test.tsv", sep="\t", error_bad_lines=False, warn_bad_lines=False)

In [39]:
test_df.shape

(1639, 5)

In [40]:
test_df.head()

Unnamed: 0,index,#1 ID,#2 ID,#1 String,#2 String
0,0,1089874,1089925,"PCCW 's chief operating officer , Mike Butcher...",Current Chief Operating Officer Mike Butcher a...
1,1,3019446,3019327,The world 's two largest automakers said their...,Domestic sales at both GM and No. 2 Ford Motor...
2,2,1945605,1945824,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,3,1430402,1430329,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,4,3354381,3354396,The company didn 't detail the costs of the re...,But company officials expect the costs of the ...


In [41]:
# Load the dev split (tab-separated). The path uses forward slashes so the
# relative location resolves on Windows as well as on POSIX systems.
# NOTE(review): error_bad_lines=False together with warn_bad_lines=False drops
# malformed rows without any message — confirm the resulting row count is the
# expected one. Both flags are deprecated in newer pandas releases in favour of
# `on_bad_lines`; switch when the pinned pandas version allows it.
dev_df = pd.read_csv("data/dev.tsv", sep="\t", error_bad_lines=False, warn_bad_lines=False)

In [42]:
dev_df.shape

(388, 5)

In [43]:
dev_df.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,1355540,1355592,He said the foodservice pie business doesn 't ...,The foodservice pie business does not fit our...
1,0,487993,487952,"The dollar was at 116.92 yen against the yen ,...","The dollar was at 116.78 yen JPY = , virtually..."
2,1,1989515,1989458,The AFL-CIO is waiting until October to decide...,The AFL-CIO announced Wednesday that it will d...
3,0,1783137,1782659,No dates have been set for the civil or the cr...,No dates have been set for the criminal or civ...
4,1,3039165,3039036,Wal-Mart said it would check all of its millio...,It has also said it would review all of its do...


In [44]:
lem = WordNetLemmatizer()
def clean_text(text):
    """Normalise a sentence to lower-cased, lemmatised, letters-only text.

    Parameters
    ----------
    text : str
        Raw sentence. It is passed straight to ``re.sub``, so it must already
        be a string.

    Returns
    -------
    str
        The lemmatised tokens joined by single spaces.
    """
    # Anything that is not an ASCII letter (digits, punctuation, symbols)
    # becomes a space so that it acts as a token separator.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # Lower-case first, then split on whitespace to obtain the tokens.
    tokens = letters_only.lower().split()

    # Lemmatise each token with the module-level `lem` and rebuild the string.
    return " ".join(lem.lemmatize(token) for token in tokens)

### Using bag of words - baseline

In [45]:
def build_corpus_from_df(df_list, col_list):
    """Flatten the given columns of several DataFrames into one list of strings.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to read from, processed in the order given.
    col_list : iterable of column labels
        Columns to take from every frame, processed in the order given.

    Returns
    -------
    list of str
        For each frame, for each column, every cell value converted to a
        string via ``.values.astype(str)``.
    """
    return [
        sentence
        for frame in df_list
        for column in col_list
        for sentence in frame[column].values.astype(str).tolist()
    ]

In [46]:
# Collect both sentence columns from the train, test and dev frames into one
# flat list of strings; the vectorizer below is fitted on this list.
corpus = build_corpus_from_df([train_df, test_df, dev_df], ["#1 String", "#2 String"])

In [47]:
# Fit a bag-of-words CountVectorizer on the combined corpus, with
# stop_words='english' enabled.
vectorizer = CountVectorizer(stop_words='english').fit(corpus)

In [48]:
type(vectorizer)

sklearn.feature_extraction.text.CountVectorizer

In [49]:
def transform_get_score(vectorizer, col1, col2):
    """Cosine similarity between the vectorised forms of two values.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``transform``
        Used to turn each value into a one-row matrix.
    col1, col2 : any
        Values to compare; each is coerced with ``str()`` first.

    Returns
    -------
    The single entry ``[0][0]`` of the similarity matrix produced by
    ``cosine_similarity`` for the two one-row inputs.
    """
    first_vector = vectorizer.transform([str(col1)])
    second_vector = vectorizer.transform([str(col2)])
    similarity_matrix = cosine_similarity(first_vector, second_vector)
    return similarity_matrix[0][0]

In [50]:
# Spot-check the bag-of-words similarity on one hand-picked sentence pair,
# going through the helper instead of repeating its body inline.
transform_get_score(
    vectorizer,
    "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
    "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
)

0.5590169943749475

In [51]:
# Score every dev pair row by row (axis=1) with the bag-of-words helper and
# keep the results as a NumPy array via .values.
similarity_score = dev_df.apply(lambda x: transform_get_score(vectorizer, x["#1 String"], x["#2 String"]), axis=1).values

In [52]:
# Predict 1 when the similarity exceeds 0.5 and 0 otherwise, for comparison
# with the `Quality` labels.
# NOTE(review): the 0.5 cut-off is hard-coded; no tuning step is shown here.
predicted_values = np.where(similarity_score>.5, 1, 0)

In [53]:
actual_values = dev_df['Quality'].values

In [54]:
f1_score(actual_values, predicted_values)

0.8019801980198019

### Using transformers

In [55]:
from sentence_transformers import SentenceTransformer, util

In [111]:
# NOTE(review): `model` is rebound to "paraphrase-MiniLM-L6-v2" in the following
# cell, so on a top-to-bottom run this mpnet model is loaded but never used for
# scoring. Keep only the model that is actually meant to be evaluated.
model = SentenceTransformer("paraphrase-mpnet-base-v2")

I0625 00:39:48.798555   200 SentenceTransformer.py:41] Load pretrained SentenceTransformer: paraphrase-mpnet-base-v2
I0625 00:39:48.800557   200 SentenceTransformer.py:45] Did not find folder paraphrase-mpnet-base-v2
I0625 00:39:48.802564   200 SentenceTransformer.py:51] Search model on server: http://sbert.net/models/paraphrase-mpnet-base-v2.zip
I0625 00:39:48.805566   200 SentenceTransformer.py:107] Load SentenceTransformer from folder: C:\Users\HP/.cache\torch\sentence_transformers\sbert.net_models_paraphrase-mpnet-base-v2
I0625 00:40:12.329288   200 SentenceTransformer.py:131] Use pytorch device: cpu


In [120]:
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

I0625 00:42:42.278540   200 SentenceTransformer.py:41] Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
I0625 00:42:42.280544   200 SentenceTransformer.py:45] Did not find folder paraphrase-MiniLM-L6-v2
I0625 00:42:42.282542   200 SentenceTransformer.py:51] Search model on server: http://sbert.net/models/paraphrase-MiniLM-L6-v2.zip
I0625 00:42:42.285548   200 SentenceTransformer.py:107] Load SentenceTransformer from folder: C:\Users\HP/.cache\torch\sentence_transformers\sbert.net_models_paraphrase-MiniLM-L6-v2
I0625 00:42:42.885028   200 SentenceTransformer.py:131] Use pytorch device: cpu


In [121]:
sent1 = "I am not happy"
sent2 = "I am sad"

In [122]:
# Encode each sample sentence on its own; convert_to_tensor=True requests the
# embedding as a tensor, which is what the similarity call below consumes.
sent1_embedding = model.encode(sent1, convert_to_tensor=True)
sent2_embedding = model.encode(sent2, convert_to_tensor=True)







In [123]:
# Cosine similarity between the two sample embeddings.
# Note: this rebinds `similarity_score`, which previously held the
# bag-of-words scores computed for dev_df.
similarity_score = util.pytorch_cos_sim(sent1_embedding, sent2_embedding)

In [124]:
similarity_score

tensor([[0.7270]])

In [86]:
def transform_get_score(model, val1, val2):
    """Cosine similarity between the sentence embeddings of two values.

    This reuses the name of the bag-of-words helper defined earlier in the
    notebook; from this cell onward the name refers to this embedding-based
    version.

    Parameters
    ----------
    model : object exposing ``encode(text, convert_to_tensor=..., show_progress_bar=...)``
        In this notebook a ``SentenceTransformer`` instance.
    val1, val2 : any
        Values to compare; each is coerced with ``str()`` before encoding.

    Returns
    -------
    The value returned by ``util.pytorch_cos_sim`` for the two embeddings
    (for a pair of single sentences the notebook shows a 1x1 tensor such as
    ``tensor([[0.7270]])``).
    """
    # Silence the per-call progress bar: this helper is applied once per row,
    # and a bar for every single-sentence encode floods the cell output.
    first_embedding = model.encode(str(val1), convert_to_tensor=True, show_progress_bar=False)
    second_embedding = model.encode(str(val2), convert_to_tensor=True, show_progress_bar=False)
    return util.pytorch_cos_sim(first_embedding, second_embedding)

In [107]:
# Encode each sentence column in one batched call instead of invoking the model
# twice per row; this avoids hundreds of single-sentence forward passes and the
# progress-bar output each of them produces.
# NOTE(review): the recorded execution counts show this cell ran before the
# model-loading cells above, so the stored F1 below may stem from a different
# model than a top-to-bottom run would use — re-run the notebook in order.
first_embeddings = model.encode(
    dev_df["#1 String"].astype(str).tolist(), convert_to_tensor=True, show_progress_bar=False
)
second_embeddings = model.encode(
    dev_df["#2 String"].astype(str).tolist(), convert_to_tensor=True, show_progress_bar=False
)
# pytorch_cos_sim returns the full pairwise matrix; entry (i, i) on its diagonal
# compares row i's first sentence with row i's second sentence. Converting to a
# NumPy float array lets the threshold comparison below work element-wise.
similarity_score = util.pytorch_cos_sim(first_embeddings, second_embeddings).diagonal().cpu().numpy()

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [108]:
# Predict 1 when the transformer similarity exceeds 0.75 and 0 otherwise.
# NOTE(review): this cut-off is hard-coded and differs from the 0.5 used for the
# bag-of-words baseline; no tuning step is shown for either value.
predicted_values = np.where(similarity_score>.75, 1, 0)

In [109]:
actual_values = dev_df['Quality'].values

In [110]:
f1_score(actual_values, predicted_values)

0.7899807321772638