In [1]:
import csv, string, nltk
import pandas as pd
import numpy as np

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix
from scipy.stats import wasserstein_distance

### Preprocess the data for TF-IDF 
1. Case-fold the text (aggressive lowercasing)
2. Replace the punctuation with spaces
3. Tokenize the data
4. Drop English stopwords
5. Lemmatize (tokens are treated as verbs)
6. Apply Porter stemming

In [2]:
def preprocess(data):
    """Normalize one raw text string for TF-IDF vectorization.

    The text is case-folded, punctuation is replaced by spaces, the result is
    tokenized with NLTK, English stopwords are dropped, and every remaining
    token is lemmatized as a verb and then Porter-stemmed.

    Parameters
    ----------
    data : str
        Raw text to normalize.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

    # Case-fold first, then blank out punctuation so it cannot glue tokens together
    cleaned = data.casefold().translate(punctuation_to_space)

    processed = []
    for token in nltk.word_tokenize(cleaned):
        if token in english_stopwords:
            continue
        # Lemmatize before stemming, mirroring the documented pipeline order
        lemma = lemmatizer.lemmatize(token, pos="v")
        processed.append(stemmer.stem(lemma))

    return ' '.join(processed)

### Get the columns whose data match a specific data type
data - pandas DataFrame 

t - python data type

In [3]:
def get_type_columns(data, t):
    """Return the names of the columns whose first value has exactly type ``t``.

    Only the first row of each column is inspected, and the comparison is an
    exact type match (``type(value) is t``), so subclasses do not count.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are inspected.
    t : type
        Type to look for.

    Returns
    -------
    list
        Column labels, in DataFrame order, whose first value is of type ``t``.
        Empty when ``data`` has no rows.
    """
    # No first row to inspect: nothing can match.
    if len(data) == 0:
        return []

    # Positional access (.iloc) instead of ``data[c][0]``: the latter is a
    # label lookup and fails when the index does not contain the label 0
    # (e.g. after filtering rows or with a custom index).
    return [c for c in data.columns if type(data[c].iloc[0]) is t]

### TF-IDF and cosine similarity between columns of 2 datasets
Given 2 datasets, compute the cosine similarity column by column (each column from corpus1 with each column from corpus2).

In [4]:
def tf_idf_cos_sim_by_col(corpus1, corpus2):
    """Compare every column of ``corpus1`` with every column of ``corpus2``.

    Each column is flattened into a single document (all of its cells cast to
    text and joined by spaces). For every column pair a TF-IDF model is fitted
    on the two documents and their cosine similarity is computed.

    Parameters
    ----------
    corpus1, corpus2 : pandas.DataFrame
        Tables whose columns are compared pairwise.

    Returns
    -------
    dict
        Maps each column label of ``corpus1`` to a list with one entry per
        column of ``corpus2`` (in column order). Each entry is a length-1
        array holding the similarity, i.e. ``result[c0][j][0]``.
    """
    vectorizer = TfidfVectorizer(preprocessor=preprocess)

    # Build the per-column documents of corpus2 once; they do not depend on c0.
    other_docs = [(c1, ' '.join(corpus2[c1].astype('U').tolist()))
                  for c1 in corpus2.columns]

    result = {}
    for c0 in corpus1.columns:
        doc0 = ' '.join(corpus1[c0].astype('U').tolist())
        similarities = []
        for c1, doc1 in other_docs:
            # Progress trace; the f-string also copes with non-string labels.
            print(f'\t{c1}')
            # One document per column, so row 0 is corpus1[c0] and row 1 is
            # corpus2[c1]. Concatenating the raw cell lists instead would make
            # rows 0 and 1 the first two cells of corpus1[c0].
            doc = [doc0, doc1]
            X = vectorizer.fit(doc).transform(doc)
            sim = cosine_similarity(X[0], X[1])
            similarities.append(sim[0])
        result[c0] = similarities
    return result

### TF-IDF and cosine similarity between rows of 2 columns
Given 2 columns, compute the cosine similarity row by row (each row from column1 with each row from column2).

In [5]:
def tf_idf_cos_sim_by_row(column1, column2):
    """Compare each row of ``column1`` with all rows of ``column2`` via TF-IDF.

    For every row of ``column1`` a corpus is built from that row followed by
    all rows of ``column2``; a TF-IDF model is fitted on it and the full
    pairwise cosine-similarity matrix of the corpus is computed.

    Parameters
    ----------
    column1, column2 : iterable
        Rows to compare (e.g. pandas Series). Values are cast to ``str``.

    Returns
    -------
    list
        One square matrix per row of ``column1``, of size
        ``len(column2) + 1``. Index 0 of each matrix is the row taken from
        ``column1``; indices 1.. follow the order of ``column2``.
        Every matrix is dense, so memory grows with
        ``len(column1) * len(column2) ** 2``.
    """
    vectorizer = TfidfVectorizer(preprocessor=preprocess)

    # Materialize column2 once as text; it is the same for every row.
    other_rows = [str(value) for value in column2]

    result = []
    for data in column1:
        # Build a real list of documents. ``data + column2`` is not a list
        # concatenation: with a pandas Series it prefixes ``data`` to every
        # element, so the row from column1 never becomes its own document.
        corpus = [str(data)] + other_rows
        X = vectorizer.fit(corpus).transform(corpus)
        result.append(cosine_similarity(X, X))

    return result

### Write a dictionary to CSV

In [6]:
def dict_to_csv(dictonary, filename, columns):
    """Write the values of a dictionary of similarity lists to a CSV file.

    The first row written is ``columns``. It is followed by one row per key of
    ``dictonary`` (in dictionary order) holding the first element of each
    entry stored under that key. The keys themselves are not written.

    Parameters
    ----------
    dictonary : dict
        Maps a key to a sequence of indexable entries (``entry[0]`` is written).
    filename : str
        Path of the CSV file to create or overwrite.
    columns : sequence
        Header row.

    Notes
    -----
    I/O failures are reported on stdout instead of being raised.
    """
    try:
        # newline='' is required by the csv module; without it the writer's
        # own '\r\n' terminator gets translated again and rows end up
        # separated by blank lines on platforms that translate newlines.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(columns)
            for entries in dictonary.values():
                writer.writerow([entry[0] for entry in entries])
    except IOError as err:
        # Keep the best-effort behaviour but say what actually went wrong.
        print(f"I/O error: {err}")

### Return similar columns based on the column names
Given 2 datasets, return the similar columns based on the column names. The similarity is computed using TF-IDF and cosine similarity. 

In [7]:
def filter_sim_col(data1, data2, threshold=0.5):
    """Pair up columns of two DataFrames whose *names* are textually similar.

    The column labels of both frames are pooled into one corpus, vectorized
    with TF-IDF (after ``preprocess``) and compared with cosine similarity.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Tables whose column labels are compared.
    threshold : float, optional
        A pair is kept when its similarity is strictly greater than this
        value. Defaults to 0.5.

    Returns
    -------
    dict
        Maps a column label to a label whose name similarity exceeds
        ``threshold``. Because the labels are pooled, a pair can show up in
        both directions and both labels may come from the same frame. When a
        label has several matches, the last one in pooled order wins.
        Empty when neither frame has columns.
    """
    corpus = data1.columns.tolist() + data2.columns.tolist()
    # Nothing to vectorize: avoid fitting TF-IDF on an empty corpus.
    if not corpus:
        return {}

    # Vectorize the text form of each label so non-string labels do not break
    # preprocessing; the original labels are what ends up in the result.
    texts = [str(label) for label in corpus]
    vectorizer = TfidfVectorizer(preprocessor=preprocess)
    X = vectorizer.fit(texts).transform(texts)
    sim = cosine_similarity(X, X)

    result = {}
    # argwhere walks the matrix in row-major order, so later matches of the
    # same row label overwrite earlier ones.
    for r, c in np.argwhere(sim > threshold):
        if r != c:  # skip a label compared with itself
            result[corpus[r]] = corpus[c]

    return result

### Compute Jaccard similarity between 2 columns
Given 2 columns, compute the Jaccard distance (1 − Jaccard similarity) between each row from column1 and each row from column2.

Return: A matrix of len(column1) x len(column2) elements. Element i,j represents the distance between row i from column1 and row j from column2.

In [8]:
def jaccard_sim_row(column1, column2):
    """Compute the pairwise Jaccard distance between the rows of two columns.

    Every value is turned into a set of its elements (for a string: its
    characters) and the distance ``(|A ∪ B| - |A ∩ B|) / |A ∪ B|`` is computed
    for each pair, which is the formula ``nltk.jaccard_distance`` uses.

    Parameters
    ----------
    column1, column2 : iterable
        Rows to compare. Non-iterable values (e.g. a numeric placeholder) are
        compared through their text form.

    Returns
    -------
    list of list of float
        ``len(column1)`` rows of ``len(column2)`` distances; element ``[i][j]``
        is the distance between row ``i`` of ``column1`` and row ``j`` of
        ``column2``. Two empty values have distance 0.0.
    """
    def _as_set(value):
        # Iterables become the set of their elements; scalars fall back to
        # the characters of their string representation.
        try:
            elements = iter(value)
        except TypeError:
            elements = iter(str(value))
        return set(elements)

    # Convert column2 once instead of once per row of column1.
    other_sets = [_as_set(c2) for c2 in column2]

    result = []
    for c1 in column1:
        current = _as_set(c1)
        jds = []
        for other in other_sets:
            union_size = len(current | other)
            if union_size == 0:
                # Both sets are empty: identical, and the ratio is undefined.
                jds.append(0.0)
            else:
                jds.append((union_size - len(current & other)) / union_size)
        result.append(jds)
    return result

### Find similar numerical columns

In [9]:
def find_sim_num_cols(data1, data2):
    """Return the first pair of similarly named columns that look numerical.

    A column counts as numerical when ``get_type_columns`` reports it for
    ``float``, ``int`` or ``np.float64``. Candidate pairs come from
    ``filter_sim_col`` and are checked in its iteration order.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Tables to search.

    Returns
    -------
    list or None
        ``[column_a, column_b]`` for the first matching pair, or ``None``
        when no pair qualifies.
    """
    # NOTE(review): NumPy integer scalar types are not in this list -- confirm
    # whether integer-dtype columns are meant to be excluded.
    numeric_types = (float, int, np.float64)
    numeric1 = [c for t in numeric_types for c in get_type_columns(data1, t)]
    numeric2 = [c for t in numeric_types for c in get_type_columns(data2, t)]

    # Special case: data1 exposes no numerical column at all but data2 does.
    only_data2_numeric = not numeric1 and bool(numeric2)

    for name, match in filter_sim_col(data1, data2).items():
        if only_data2_numeric and name in numeric2:
            return [name, match]

        if name in numeric1 and match in numeric2:
            return [name, match]
        if name in numeric2 and match in numeric1:
            # Pair was found in the reverse direction; swap it back.
            return [match, name]

    return None

# Example

### Read data

In [10]:
# Load both movie tables; missing values are filled with 0 right away
data_imdb = pd.read_csv('movies3/csv_files/imdb.csv').fillna(0)

# The replace call targets the 'N' and '.' entries of the Rating column
# (presumably "no rating" placeholders -- verify against the CSV)
data_rt = (
    pd.read_csv('movies3/csv_files/rotten_tomatoes.csv')
    .fillna(0)
    .replace({'Rating': ['N', '.']}, {'Rating': 0})
)

# Names used by the rest of the notebook
data1, data2 = data_imdb, data_rt

### Find the similar numerical columns

In [11]:
numerical_cols = find_sim_num_cols(data1, data2)
# find_sim_num_cols returns None when no pair qualifies; stop here with a
# clear message instead of failing later when the result is indexed.
assert numerical_cols is not None, "no similar numerical columns found"
print(numerical_cols)

['Rating', 'Rating']


### Compute EMD on the similar columns

In [12]:
# Wasserstein (earth mover's) distance between the two matched columns
col1, col2 = numerical_cols
emd = wasserstein_distance(data1[col1], data2[col2])
print(emd)

1.331381641581892


### Find non-numerical similar columns

In [13]:
similar_columns = filter_sim_col(data1, data2)
# Drop the numerical pair handled above. pop(..., None) instead of del:
# find_sim_num_cols may return the pair swapped, in which case its first
# element is a value of this dict and not necessarily one of its keys.
similar_columns.pop(numerical_cols[0], None)

print(similar_columns)

{'ID': 'ID', 'Title': 'Title', 'Year': 'Year', 'Director': 'Director', 'Creators': 'Creators', 'Cast': 'Cast', 'Genre': 'Genre', 'Duration': 'Duration', 'ContentRating': 'ContentRating', 'Summary': 'Summary'}


### Compute Jaccard similarity between 2 columns

In [15]:
# Pairwise comparison of every IMDb title with every Rotten Tomatoes title;
# the result is a nested list with len(data1) rows of len(data2) values.
result_jd = jaccard_sim_row(data1['Title'], data2['Title'])

### Test the tf-idf cosine similarity

Note: this does not work well yet.

In [None]:
# Restrict the experiment to the Title pair only
columns = {key: value for key, value in similar_columns.items() if key == 'Title'}
print(columns)

map_sim = {}
for k, v in columns.items():
    if k != 'Id':
        print(k)
        result_row = tf_idf_cos_sim_by_row(data1[k], data2[v])
        # Subtract the identity so the self-similarity on the diagonal does
        # not dominate, then keep the largest value over all matrices
        max_val = 0
        for r in result_row:
            max_val = max(max_val, (r - np.eye(len(r))).max())
        print(max_val)
        map_sim[k] = max_val

{'Title': 'Title'}
Title
