In [4]:
import csv, string, nltk
import pandas as pd
import numpy as np

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix
from scipy.stats import wasserstein_distance

### Preprocess the data for TF-IDF 
1. Lowercase the text (Unicode case folding)
2. Replace the punctuation with spaces
3. Tokenize the data
4. Drop English stop words and lemmatize the remaining tokens (as verbs)
5. Apply stemming

In [5]:
def preprocess(data):
    """Normalise one text string for TF-IDF vectorisation.

    The text is case-folded, every ASCII punctuation character is replaced
    by a space, and the result is tokenised with ``nltk.word_tokenize``.
    Tokens found in NLTK's English stop-word list are dropped; the rest are
    lemmatised as verbs (WordNet) and then stemmed (Porter).

    Parameters
    ----------
    data : str
        Raw text.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    english_stop_words = set(stopwords.words('english'))
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

    # Case-fold first, then blank out punctuation so it cannot glue tokens together.
    cleaned = data.casefold().translate(punctuation_to_space)

    stems = []
    for token in nltk.word_tokenize(cleaned):
        if token in english_stop_words:
            continue
        # Lemmatise before stemming, as in the listed preprocessing steps.
        lemma = lemmatizer.lemmatize(token, pos="v")
        stems.append(stemmer.stem(lemma))

    return ' '.join(stems)

### Get the columns whose data match a specific data type
data - pandas DataFrame 

t - python data type

In [6]:
def get_type_columns(data, t):
    """Return the columns whose first value has exactly the type ``t``.

    Only the first row of each column is inspected, and the comparison is an
    identity check on the type (``type(value) is t``), so subclasses do not
    match: a ``numpy.float64`` value is not reported for ``t=float``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    t : type
        The exact Python/NumPy type to look for.

    Returns
    -------
    list
        Matching column labels, in column order. Empty when ``data`` has no
        rows (there is no first value to inspect).
    """
    if len(data) == 0:
        return []

    # Positional access: a label lookup (``data[c][0]``) raises KeyError on a
    # frame whose index does not contain the label 0, e.g. after filtering.
    return [c for c in data.columns if type(data[c].iloc[0]) is t]

### TF-IDF and cosine similarity between columns of 2 datasets
Given 2 datasets, compute the cosine similarity column by column (each column from corpus1 with each column from corpus2).

In [7]:
def tf_idf_cos_sim_by_col(corpus1, corpus2):
    """TF-IDF cosine similarity between every column pair of two frames.

    Each column is turned into ONE document (all of its cells converted to
    text and joined with spaces). For every pair (column of ``corpus1``,
    column of ``corpus2``) a TF-IDF model is fitted on the two documents and
    the cosine similarity between them is recorded.

    Comparing ``X[0]`` with ``X[1]`` on a row-per-document matrix would only
    compare the first two cells of the first column, which is why the
    columns are collapsed into single documents here.

    Parameters
    ----------
    corpus1, corpus2 : pandas.DataFrame
        The two datasets to compare.

    Returns
    -------
    dict
        Maps each column label of ``corpus1`` to a list with one entry per
        column of ``corpus2`` (in column order). Every entry is a NumPy
        array of shape ``(1,)`` holding the similarity, so ``entry[0]`` is
        the scalar. A pair whose documents yield an empty vocabulary (for
        example only stop words) gets a similarity of 0.
    """
    vectorizer = TfidfVectorizer(preprocessor=preprocess)

    def column_as_document(frame, name):
        # One document per column: every cell as text, separated by spaces.
        return ' '.join(frame[name].astype('U').tolist())

    # The corpus2 documents do not depend on the outer loop; build them once.
    other_docs = [(c1, column_as_document(corpus2, c1)) for c1 in corpus2.columns]

    result = {}
    for c0 in corpus1.columns:
        doc0 = column_as_document(corpus1, c0)
        similarities = []
        for c1, doc1 in other_docs:
            # str() keeps the progress line working for non-string labels.
            print('\t' + str(c1))
            try:
                X = vectorizer.fit_transform([doc0, doc1])
            except ValueError:
                # scikit-learn raises ValueError for an empty vocabulary;
                # nothing in common can be measured, so report 0.
                similarities.append(np.zeros(1))
                continue
            sim = cosine_similarity(X[0], X[1])
            similarities.append(sim[0])
        result[c0] = similarities
    return result

### TF-IDF and cosine similarity between rows of 2 columns
Given 2 columns, compute the cosine similarity row by row.
Compare each row from column1 with each row from column2. If the cosine similarity is greater than the threshold, append the result and move on to the next row of column1. The algorithm can also "learn" how large the offset between the columns is (e.g. if row1 from column1 matches row150 from column2 and row2 from column1 matches row160 from column2, then the search for row3 will start from index = (((150-1) + (160-2)) / 2) * factor, where factor = 1.3 or any other number). 

In [8]:
def tf_idf_cos_sim_by_row(column1, column2, threshold, factor=1.3):
    """Match each row of ``column1`` to the first similar row of ``column2``.

    For every row of ``column1`` the rows of ``column2`` are scanned from a
    start position ``j``. For each candidate a TF-IDF model is fitted on the
    two texts, and the scan stops at the first candidate whose cosine
    similarity is strictly greater than ``threshold``.

    The start position is ``max(int(i - factor * diff), 0)``, where ``i`` is
    the row number in ``column1`` and ``diff`` is a running average
    (``diff = (diff + |match - i|) / 2``) of the offsets between matched
    positions. ``diff`` is only updated when a match was actually found, so
    a row without a match does not distort the estimate.

    Parameters
    ----------
    column1, column2 : iterable
        Row values (for example pandas Series). Values are converted with
        ``str()`` before vectorisation, so non-string cells do not crash the
        preprocessor.
    threshold : float
        Minimum (exclusive) cosine similarity for a match.
    factor : float, optional
        Multiplier applied to the learned offset; defaults to 1.3.

    Returns
    -------
    list
        One ``[position_in_column2, similarity]`` entry per row of
        ``column1`` that found a match, in ``column1`` order. Rows without a
        match contribute nothing.
    """
    vectorizer = TfidfVectorizer(preprocessor=preprocess, stop_words=None)

    # Materialise column2 once so that positions below are always positional,
    # whatever index a Series carries.
    candidates = list(column2)

    result = []
    diff = 0
    for i, data1 in enumerate(column1):
        j = max(int(i - factor * diff), 0)
        print(i, j)

        text1 = str(data1)
        match_pos = None
        for pos in range(j, len(candidates)):
            try:
                X = vectorizer.fit_transform([text1, str(candidates[pos])])
            except ValueError:
                # Empty vocabulary (e.g. both texts are only stop words):
                # this pair cannot be scored, try the next candidate.
                continue
            cos_sim_data = cosine_similarity(X, X)[0][1]

            if cos_sim_data > threshold:
                result.append([pos, cos_sim_data])
                match_pos = pos
                break

        if match_pos is None:
            # Nothing matched (or nothing left to scan): keep the learned
            # offset as it is rather than feeding it a bogus position.
            continue

        diff = (diff + abs(match_pos - i)) / 2
        print('\t', match_pos)

    return result

### Write a dictionary to csv

In [9]:
def dict_to_csv(dictonary, filename, columns):
    """Write the values of a dictionary to a CSV file, one row per key.

    The first row written is ``columns``. Then, for every key in insertion
    order, one row is written that contains the first element (``r[0]``) of
    each item stored under that key. The keys themselves are not written.

    Parameters
    ----------
    dictonary : dict
        Maps a key to a sequence of indexable items.
    filename : str
        Path of the CSV file to create or overwrite.
    columns : sequence
        Header row.

    Notes
    -----
    I/O failures are reported with a printed message instead of being
    raised (best-effort behaviour).
    """
    try:
        # newline='' is required by the csv module; without it every row is
        # followed by an empty line on platforms that translate '\n'.
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(columns)
            writer.writerows(
                [r[0] for r in rows] for rows in dictonary.values()
            )
    except IOError:
        print("I/O error") 

### Return similar columns based on the column names
Given 2 datasets, return the similar columns based on the column names. The similarity is computed using TF-IDF and cosine similarity. 

In [10]:
def filter_sim_col(data1, data2, threshold=0.5):
    """Pair columns of two frames whose NAMES are similar.

    A TF-IDF model is fitted on all column names of both frames. Every
    column name of ``data1`` is then compared (cosine similarity) with
    every column name of ``data2`` only, never with names of its own
    frame, and is paired with its best-scoring counterpart when that score
    is strictly greater than ``threshold``.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two datasets.
    threshold : float, optional
        Minimum (exclusive) similarity for a pair; defaults to 0.5.

    Returns
    -------
    dict
        Maps a column label of ``data1`` to the most similar column label of
        ``data2``. Columns without a counterpart above the threshold are
        omitted. Empty when either frame has no columns or when no name
        yields a usable token.
    """
    columns = data1.columns.tolist()
    other_columns = data2.columns.tolist()
    if not columns or not other_columns:
        return {}

    # Labels may be non-strings (e.g. integers); the preprocessor needs text.
    corpus = [str(name) for name in columns + other_columns]

    vectorizer = TfidfVectorizer(preprocessor=preprocess)
    try:
        X = vectorizer.fit_transform(corpus)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary.
        return {}

    # Rows: names of data1. Columns: names of data2.
    n_first = len(columns)
    sim = cosine_similarity(X[:n_first], X[n_first:])

    result = {}
    for r, name in enumerate(columns):
        best = int(sim[r].argmax())
        if sim[r][best] > threshold:
            result[name] = other_columns[best]

    return result

### Compute Jaccard similarity between 2 columns
Given 2 columns, compute the jaccard similarity (each row from column1 with each row from column2)

Return: A matrix of len(column1) x len(column2) elements. Element i,j represents the distance between row i from column1 and row j from column2.

In [23]:
def jaccard_sim_row(column1, column2, tokenizer=None):
    """Jaccard distance between every row of ``column1`` and of ``column2``.

    Parameters
    ----------
    column1, column2 : iterable
        Row values.
    tokenizer : callable, optional
        Turns one row value into an iterable of hashable tokens (for example
        ``str.split``); the Jaccard distance is then computed on the token
        sets. When omitted, each row is wrapped in a one-element set, so the
        distance is 0.0 for equal rows and 1.0 otherwise.

    Returns
    -------
    list of list of float
        ``result[i][j]`` is the distance between row ``i`` of ``column1``
        and row ``j`` of ``column2``.
    """
    def as_set(row):
        if tokenizer is None:
            return set([row])
        return set(tokenizer(row))

    def distance(first, second):
        # Two empty token sets have an empty union; treat them as identical
        # instead of dividing by zero.
        if not first and not second:
            return 0.0
        return nltk.jaccard_distance(first, second)

    # The sets of column2 are the same for every row of column1: build once.
    sets2 = [as_set(c2) for c2 in column2]

    result = []
    for c1 in column1:
        set1 = as_set(c1)
        result.append([distance(set1, set2) for set2 in sets2])
    return result

### Find similar numerical columns

In [12]:
def find_sim_num_cols(data1, data2):
    """Return the first pair of similarly named numeric columns.

    A column counts as numeric when ``get_type_columns`` reports it for one
    of ``float``, ``int`` or ``np.float64``.
    NOTE(review): other NumPy scalar types such as ``np.int64`` are not in
    that list; confirm whether integer columns are meant to be skipped.

    The name pairs produced by ``filter_sim_col`` are scanned in order and
    the first pair that qualifies is returned.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two datasets.

    Returns
    -------
    list or None
        A two-element list of column names, or ``None`` when no pair
        qualifies.
    """
    numeric_types = (float, int, np.float64)
    numeric_first = [c for t in numeric_types for c in get_type_columns(data1, t)]
    numeric_second = [c for t in numeric_types for c in get_type_columns(data2, t)]

    # Special case kept from the original logic: data1 has no numeric column
    # at all but data2 does.
    only_second_numeric = not numeric_first and bool(numeric_second)

    for name, partner in filter_sim_col(data1, data2).items():
        if only_second_numeric and name in numeric_second:
            return [name, partner]

        if name in numeric_first and partner in numeric_second:
            return [name, partner]
        if name in numeric_second and partner in numeric_first:
            # Pair was found in the reverse direction; put data1's name first.
            return [partner, name]

    return None

# Example

### Read data

In [13]:
# Load the IMDb table; missing values are replaced with 0.
data_imdb = pd.read_csv('movies3/csv_files/imdb.csv').fillna(0)

# Load the Rotten Tomatoes table the same way; in its 'Rating' column the
# values 'N' and '.' are additionally replaced with 0.
data_rt = (
    pd.read_csv('movies3/csv_files/rotten_tomatoes.csv')
    .fillna(0)
    .replace({'Rating': ['N', '.']}, {'Rating': 0})
)

# Generic aliases used by the cells below.
data1, data2 = data_imdb, data_rt

### Find the similar numerical columns

In [12]:
# First pair of similarly named numeric columns found in the two datasets
# (find_sim_num_cols returns None when there is no such pair).
numerical_cols = find_sim_num_cols(data1, data2)
print(numerical_cols)

['Rating', 'Rating']


### Compute EMD on the similar columns

In [13]:
# Earth mover's (Wasserstein) distance between the values of the two
# numeric columns selected above.
emd = wasserstein_distance(data1[numerical_cols[0]], data2[numerical_cols[1]])
print(emd)

1.331381641581892


### Find non-numerical similar columns

In [14]:
similar_columns = filter_sim_col(data1, data2)

# Drop the numeric pair so that only non-numerical columns remain.
# pop() with a default, guarded by the truthiness check, avoids a KeyError
# or TypeError when no numeric pair was found or its first name is not a
# key of the mapping.
if numerical_cols:
    similar_columns.pop(numerical_cols[0], None)

print(similar_columns)

{'ID': 'ID', 'Title': 'Title', 'Year': 'Year', 'Director': 'Director', 'Creators': 'Creators', 'Cast': 'Cast', 'Genre': 'Genre', 'Duration': 'Duration', 'ContentRating': 'ContentRating', 'Summary': 'Summary'}


### Compute Jaccard similarity between 2 columns

In [25]:
# `timer` is otherwise only imported by a later cell; importing it here
# keeps this cell runnable on a fresh kernel (Restart & Run All).
from timeit import default_timer as timer

# Time the row-by-row Jaccard computation on the two 'Title' columns.
t1 = timer()
result_jd = jaccard_sim_row(data1['Title'], data2['Title'])
t2 = timer()

print(t2-t1)

9.59807987399995


### Test the tf-idf cosine similarity

Note: this is not working well yet.

In [16]:
# `timer` is otherwise only imported by a later cell; importing it here
# keeps this cell runnable on a fresh kernel (Restart & Run All).
from timeit import default_timer as timer

start = timer()

# Restrict the experiment to the 'Title' pair.
columns = {key: value for key, value in similar_columns.items() if key == 'Title'}
print(columns)

map_sim = {}
for k, v in columns.items():
    # Skip the identifier column. The datasets spell it 'ID', so compare
    # case-insensitively rather than against the literal 'Id'.
    if str(k).lower() == 'id':
        continue
    print(k)
    # Only the first 2000 rows of each column, with a similarity threshold
    # of 0.5.
    map_sim[k] = tf_idf_cos_sim_by_row(data1[k][0:2000], data2[v][0:2000], 0.5)

end = timer()
print(end-start)

{'Title': 'Title'}
Title


KeyboardInterrupt: 

In [26]:
from datasketch import MinHash
from timeit import default_timer as timer


# Estimate the Jaccard similarity between the two sets of titles with
# MinHash signatures.
data11 = data1['Title']
data22 = data2['Title']

t1 = timer()
m1, m2 = MinHash(), MinHash()
for d in data11:
    # str(): missing values were filled with the integer 0 when the data
    # was loaded, and an int has no .encode().
    m1.update(str(d).encode('utf8'))
for d in data22:
    m2.update(str(d).encode('utf8'))
t2 = timer()
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
print(t2-t1)

Estimated Jaccard for data1 and data2 is 0.6328125
0.09804414900008851


In [43]:
# Print the position of every 'Title' row from position 300 onwards.
titles_tail = data1['Title'][300:]
for i, d in enumerate(titles_tail):
    row_number = i + 300
    print(row_number)

300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549


In [39]:
# Jaccard distance between the sets of distinct values of every column of
# data1 and every column of data2.

# The value sets of data2 are the same for every column of data1, so they
# are built once instead of once per column pair.
value_sets2 = [set(data2[c2]) for c2 in data2.columns]

for c1 in data1.columns:
    print(c1)
    values1 = set(data1[c1])
    for values2 in value_sets2:
        print(nltk.jaccard_distance(values1, values2))

Index(['ID', 'Title', 'Year', 'Rating', 'Director', 'Creators', 'Cast',
       'Genre', 'Duration', 'ContentRating', 'Summary'],
      dtype='object')
Index(['ID', 'Title', 'Year', 'Rating', 'Director', 'Creators', 'Cast',
       'Genre', 'Duration', 'ContentRating', 'Summary'],
      dtype='object')
ID
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
Title
1.0
0.40877659574468084
1.0
0.9996667777407531
0.9997986712301188
1.0
1.0
1.0
1.0
1.0
1.0
Year
1.0
0.9997725722083238
0.9996253278381416
0.9992937853107344
0.9997043169722057
0.9997458703939008
0.9997712194005949
0.9994036970781157
0.9993265993265993
1.0
0.9997712717291857
Rating
1.0
1.0
0.9992836676217765
0.993006993006993
0.9995258416311048
0.9996243425995492
0.999677211103938
0.9975247524752475
0.9952830188679245
1.0
0.9996773152629881
Director
1.0
0.9997962925239356
0.9996856334486011
0.9994813278008299
0.43637093536732235
0.8902195608782435
0.998975199836032
0.9995431703974418
0.99949924887331
1.0
0.9997952497952498
Creators
1.0
1.0