# Feature Engineering

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import nltk 
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

### Data Information

Given the title of a fake news article A and the title of a coming news article B, participants are asked to classify B into one of the three categories.

- agreed: B talks about the same fake news as A
- disagreed: B refutes the fake news in A
- unrelated: B is unrelated to A

### Data fields

- id - the id of each news pair.
- tid1 - the id of fake news title 1.
- tid2 - the id of news title 2.
- title1_zh - the fake news title 1 in Chinese.
- title2_zh - the news title 2 in Chinese.
- title1_en - the fake news title 1 in English.
- title2_en - the news title 2 in English.
- label - indicates the relation between the news pair: agreed/disagreed/unrelated.
- The English titles are machine translated from the related Chinese titles. This may help participants from all backgrounds get a better understanding of the datasets. Participants are highly recommended to use the Chinese version titles to finish the task.

### File type

- train.csv - training data contains 320,767 news pairs in both Chinese and English. This file provides the only data you can use to finish the task. Using external data is not allowed.

In [4]:
def load_data(path):
    """Read the news-pair CSV at *path* into a DataFrame keyed by ``id``.

    The ``id`` column becomes the index and the rows are returned sorted
    by that index.
    """
    frame = pd.read_csv(path)
    return frame.set_index("id").sort_index()

In [5]:
## Remove punctuation (including marks attached to a word) and lowercase the text.
def RemovePunctuation(my_str):
    """Return *my_str* lowercased with every ASCII punctuation mark deleted.

    Characters listed in ``string.punctuation`` are dropped outright rather
    than replaced by a space, so a hyphenated word such as ``"old-age"``
    collapses to ``"oldage"``.  Every other character, whitespace included,
    is kept and lowercased one character at a time.
    """
    punctuations = string.punctuation
    # Build the result with a single join instead of growing a string with
    # ``+=`` inside the loop, which is quadratic in the worst case.
    return "".join(char.lower() for char in my_str if char not in punctuations)

In [6]:
## Split the string into whitespace-separated tokens
def Tokenize(my_str):
    """Return the list of whitespace-delimited tokens found in *my_str*."""
    return my_str.split()

In [7]:
## Drop English stop words from a token sequence
import nltk
from nltk.corpus import stopwords
# NOTE: this rebinds ``stopwords`` from the NLTK corpus reader to a plain set
# of English stop words; StopWordRemoval below reads that set.
stopwords = set(stopwords.words('english'))
def StopWordRemoval(my_str):
    """Return the items of *my_str* that are not in the ``stopwords`` set.

    Despite the parameter name, *my_str* is iterated item by item;
    TextProcessing passes in the token list produced by Tokenize.
    """
    return [token for token in my_str if token not in stopwords]

In [8]:
from nltk.stem import WordNetLemmatizer
# One module-level lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()
def lemma(my_str):
    """Return a list with each item of *my_str* run through ``lemmatizer.lemmatize``.

    Despite the parameter name, *my_str* is iterated item by item;
    TextProcessing passes in a list of tokens.
    """
    return list(map(lemmatizer.lemmatize, my_str))

In [9]:
def TextProcessing(text):
    """Turn a raw title string into a list of processed tokens.

    The steps run in this order: RemovePunctuation (which also lowercases),
    Tokenize, StopWordRemoval, lemma.
    """
    # NOTE(review): punctuation, apostrophes included, is stripped before the
    # stop-word filter runs, so a contraction such as "don't" arrives there
    # as "dont" -- presumably no longer matching the stop-word list; confirm
    # this ordering is intended.
    tokens = Tokenize(RemovePunctuation(text))
    return lemma(StopWordRemoval(tokens))

In [10]:
# Load the training pairs (indexed and sorted by ``id``); the path is
# relative to the current working directory.
data = load_data("train.csv")

## Word Count and Length of the sentence

In [12]:
def normalized_word_share(row):
    """Return the fraction of distinct words shared by the two English titles.

    The score is ``|W1 & W2| / (|W1| + |W2|)``, where ``W1`` and ``W2`` are
    the sets of lowercased, whitespace-delimited words of
    ``row['title1_en']`` and ``row['title2_en']``.  It ranges from 0.0
    (nothing in common) to 0.5 (identical word sets).

    Args:
        row: Mapping-like object (for example a DataFrame row) holding
            string values under ``'title1_en'`` and ``'title2_en'``.

    Returns:
        The share as a float; 0.0 when neither title contains a word.
    """
    # ``str.split()`` with no argument splits on any run of whitespace and
    # never yields empty tokens.  Splitting on a single space produces ""
    # for leading, trailing or doubled spaces, and that empty token would be
    # counted as a shared "word" (two blank titles would score 0.5).
    w1 = {word.lower() for word in row['title1_en'].split()}
    w2 = {word.lower() for word in row['title2_en'].split()}

    total = len(w1) + len(w2)
    if total == 0:
        # Both titles are empty or whitespace-only: nothing can be shared.
        return 0.0
    return 1.0 * len(w1 & w2) / total

In [13]:
def WordCount(data):
    """Add length, word-count and word-share columns to *data* and return it.

    *data* is modified in place.  The columns are added in this order:
    ``t1len``, ``t2len`` (character length of each title), ``t1_n_words``,
    ``t2_n_words`` (number of pieces after splitting on single spaces) and
    ``word_share`` (row-wise result of normalized_word_share).
    """
    first_title = data['title1_en']
    second_title = data['title2_en']

    data['t1len'] = first_title.str.len()
    data['t2len'] = second_title.str.len()

    def count_space_separated(title):
        # Splits on single spaces only, so consecutive spaces add to the count.
        return len(title.split(" "))

    data['t1_n_words'] = first_title.apply(count_space_separated)
    data['t2_n_words'] = second_title.apply(count_space_separated)

    data['word_share'] = data.apply(normalized_word_share, axis=1)

    return data

**********************

##### Applying the above-defined functions to the dataset

In [18]:
def preprocess(data):
    """Build the feature DataFrame from the raw news pairs.

    Args:
        data: DataFrame with ``title1_en``, ``title2_en`` and ``label``
            columns.

    Returns:
        A new DataFrame on the same index as *data* (which is left
        unmodified) with the columns ``title1_en`` and ``title2_en``
        (each title replaced by its TextProcessing token list), the columns
        added by WordCount, and ``label``.
    """
    # Work on an explicit copy: adding columns to a bare column selection is
    # what triggers pandas' SettingWithCopyWarning and leaves it unclear
    # whether ``data`` is affected.
    df = data[["title1_en", "title2_en"]].copy()

    # The length / word-count / word-share features are computed from the raw
    # title strings, so this must run before the titles are tokenized below.
    df = WordCount(df)

    # Replace each raw title with its list of processed tokens.
    df["title1_en"] = df["title1_en"].apply(TextProcessing)
    df["title2_en"] = df["title2_en"].apply(TextProcessing)

    # Disabled experimental features, kept for reference:
    #
    # Create a bag-of-words column by concatenating title1_en and title2_en
    # and applying the BagOfWords function to it.
    #     df["BagOfWords"] = df['title1_en'] + df["title2_en"]
    #     df.BagOfWords = df.BagOfWords.map(BagOfWords)
    #
    # Create "TF1" and "TF2" holding the frequency of each word in title 1
    # and title 2, then add a cosine-similarity column.
    #     df["TF1"] = TermFrequency(df.BagOfWords.values, df.title1_en.values)
    #     df["TF2"] = TermFrequency(df.BagOfWords.values, df.title2_en.values)
    #     df["Cosine_similarity"] = np.vectorize(Cosine)(df["TF1"], df["TF2"])
    #     df["senti"] = data[["senti"]]

    # Assign the label as a Series (aligned on the shared index) rather than
    # as a one-column DataFrame.
    df["label"] = data["label"]

    return df

In [19]:
# Build the feature frame (processed titles, count features, label) from the loaded data.
df = preprocess(data)

In [20]:
# Preview the first rows of the feature frame.
df.head()

Unnamed: 0_level_0,title1_en,title2_en,t1len,t2len,t1_n_words,t2_n_words,word_share,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,"[two, new, oldage, insurance, benefit, old, pe...","[police, disprove, bird, nest, congress, perso...",94,111,17,18,0.057143,unrelated
1,"[come, shenzhen, sooner, later, son, also, com...","[gdp, overtopped, hong, kong, shenzhen, clarif...",144,73,28,11,0.078947,unrelated
2,"[come, shenzhen, sooner, later, son, also, com...","[shenzhens, gdp, topped, hong, kong, last, yea...",144,101,28,15,0.071429,unrelated
3,"[come, shenzhen, sooner, later, son, also, com...","[shenzhens, gdp, outstrips, hong, kong, shenzh...",144,106,28,15,0.071429,unrelated
4,"[come, shenzhen, sooner, later, son, also, com...","[shenzhens, gdp, overtakes, hong, kong, bureau...",144,107,28,16,0.046512,unrelated
