In [13]:
import pandas as pd
import numpy as np
import nltk
#nltk.download("stopwords") # run once
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer as Snowball
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

info_train_file = 'ItemInfo_train.csv'
pair_train_file = 'ItemPairs_train.csv'
pair_demo_file = 'demoPairs_train.csv' #faked data

def show_df(file):
    return pd.read_csv(file, sep=',', nrows=10)

#stemmer = Snowball("russian")
stemmer = Snowball("english")

russian_stopwords=stopwords.words("english")
#russian_stopwords=stopwords.words("russian")

def proctext(count, text):
    
    print(count, type(text))
    
    wordlist = str(text).lower().translate(str.maketrans('', '', punctuation)).split() # maketrans python 3
    wordlist1 = [word for word in wordlist if word not in russian_stopwords and word not in punctuation]
    wordlist2 = [stemmer.stem(word) for word in wordlist1]
    return ' '.join(wordlist2)

### Vectorize the Bag of Words documents
v = TfidfVectorizer()
c = CountVectorizer()
def vectorize(docs_list, vector_type='tfidf'):
    if vector_type == 'tfidf':
        
        print('Processing Text for TdidfVectorizer...')
        
        proc_docs_list = [proctext(i, d) for i, d in enumerate(docs_list)]
        
        print('Applying TfidfVectorizer...')
        
        return v.fit_transform(proc_docs_list)
    elif vector_type == 'c':
        
        print('Applying CountVectorizer...')
        
        str(docs_list)
        return c.fit_transform(docs_list)
    else:
        print('VectorTypeError')

def cosine_similarity(v1,v2):
    '''cosine_similarity(transformed_docs[2], transformed_docs[2])'''
    ## Idk why need to np.squeeze (1,148) into (148,) shape to dot product [error: shapes not aligned]
    ## toarray() [error: dimension mismatch]
    
    print('Calculating Cosine Similarity...')
    
    v1 = np.squeeze(v1.toarray())
    v2 = np.squeeze(v2.toarray())
    return np.dot(v1,v2) / ( np.sqrt(np.dot(v1,v1)) * np.sqrt(np.dot(v2,v2)) )    

demo_info = show_df(info_train_file)
demo_pair = show_df(pair_demo_file)

In [18]:
def make_info_feats(df):
    
    print('Handling Titles...')
    
    df['tfidf_title']=list(vectorize(df['title']))
    
    print('Handling Descriptions...')
    
    df['tfidf_desc']=list(vectorize(df['description']))
    
    print('Handling Categories...')
    
    df['cv_category']=list(vectorize(df['attrsJSON'], 'c'))
    
    print('make_info_feats completed.')

make_info_feats(demo_info)
demo_info

Handling Titles...
Processing Text for TdidfVectorizer...
0 <class 'str'>
1 <class 'str'>
2 <class 'str'>
3 <class 'str'>
4 <class 'str'>
5 <class 'str'>
6 <class 'str'>
7 <class 'str'>
8 <class 'str'>
9 <class 'str'>
Applying TfidfVectorizer...
Handling Descriptions...
Processing Text for TdidfVectorizer...
0 <class 'str'>
1 <class 'str'>
2 <class 'str'>
3 <class 'str'>
4 <class 'str'>
5 <class 'str'>
6 <class 'str'>
7 <class 'str'>
8 <class 'str'>
9 <class 'str'>
Applying TfidfVectorizer...
Handling Categories...
Applying CountVectorizer...
make_info_feats completed.


Unnamed: 0,itemID,categoryID,title,description,images_array,attrsJSON,price,locationID,metroID,lat,lon,tfidf_title,tfidf_desc,cv_category
0,1,81,Продам Камаз 6520,Продам Камаз 6520 20 тонн,"1064094, 5252822, 6645873, 6960145, 9230265","{""Вид техники"":""Грузовики""}",300000.0,648140,,64.686946,30.815924,"(0, 5)\t0.5773502691896257\n (0, 23)\t0.577...","(0, 208)\t0.4472135954999579\n (0, 7)\t0.44...","(0, 15)\t1\n (0, 57)\t1\n (0, 19)\t1"
1,3,14,Yamaha r6,Весь в тюнинге.,"11919573, 14412228, 3204180, 6646877","{""Вид техники"":""Мотоциклы"", ""Вид мотоцикла"":""С...",300000.0,639040,,55.678037,37.256548,"(0, 14)\t0.7071067811865475\n (0, 20)\t0.70...","(0, 211)\t0.7071067811865475\n (0, 81)\t0.7...","(0, 15)\t2\n (0, 57)\t1\n (0, 34)\t1\n (0..."
2,4,84,iPhone 3gs 8gb,"Телефон в хорошем состоянии, трещин и сколов н...","14384831, 6102021","{""Вид телефона"":""iPhone""}",3500.0,640650,,56.239398,43.460458,"(0, 6)\t0.5773502691896257\n (0, 3)\t0.5773...","(0, 178)\t0.17511088473972605\n (0, 92)\t0....","(0, 15)\t1\n (0, 56)\t1\n (0, 5)\t1"
3,7,84,Xiaomi Mi4 3гб RAM + 16гб ROM белый,"Отличный подарок на новый год от ""китайской ap...",,"{""Вид телефона"":""Другие марки""}",13500.0,662210,,55.77717,37.586194,"(0, 21)\t0.3779644730092272\n (0, 16)\t0.37...","(0, 24)\t0.08889272466298233\n (0, 4)\t0.08...","(0, 15)\t1\n (0, 56)\t1\n (0, 22)\t1\n (0..."
4,8,39,Лыжные ботинки,"Лыжные ботинки в хорошем состоянии, 34 размер","13718854, 4787310","{""Вид товара"":""Зимние виды спорта""}",500.0,624360,,55.77717,37.586194,"(0, 22)\t0.6476888299953735\n (0, 24)\t0.76...","(0, 183)\t0.3844001827986375\n (0, 17)\t0.4...","(0, 15)\t1\n (0, 59)\t1\n (0, 24)\t1\n (0..."
5,9,39,Сноуборд ботинки Nitro Team 10 us,"сноубордические ботинки Nitro Team\nразмер 42,...","12418395, 9930491","{""Вид товара"":""Зимние виды спорта""}",7000.0,644200,,58.004785,56.237654,"(0, 18)\t0.41802398937415175\n (0, 0)\t0.41...","(0, 149)\t0.32538076593964255\n (0, 68)\t0....","(0, 15)\t1\n (0, 59)\t1\n (0, 24)\t1\n (0..."
6,12,9,"LADA Priora, 2015",Машина новая пробег реальный. Не битая не краш...,"1338189, 1648456, 6321889, 9883716","{""Марка"":""ВАЗ (LADA)"", ""Модель"":""Priora"", ""Тип...",445000.0,631060,,44.219841,42.058825,"(0, 2)\t0.5773502691896257\n (0, 13)\t0.577...","(0, 151)\t0.2780440474826119\n (0, 127)\t0....","(0, 29)\t1\n (0, 13)\t1\n (0, 6)\t1\n (0,..."
7,15,32,Телевизор,"Телевизоры кинескопные, диагональ от 37 см. до...","11244051, 14467554, 2240467, 5099565, 8002433","{""Вид товара"":""Телевизоры и проекторы""}",1600.0,662810,,57.622434,39.887894,"(0, 30)\t1.0","(0, 79)\t0.1140873858555165\n (0, 176)\t0.1...","(0, 15)\t1\n (0, 59)\t1\n (0, 55)\t1\n (0..."
8,16,27,Шуба мутоновая,"качественная, производство Казань, хорошее сос...","11762574, 316289, 4015142","{""Вид одежды"":""Женская одежда"", ""Предмет одежд...",1000.0,657600,,56.495116,84.972128,"(0, 26)\t0.7071067811865475\n (0, 31)\t0.70...","(0, 224)\t0.17335477950740374\n (0, 148)\t0...","(0, 15)\t1\n (0, 39)\t2\n (0, 23)\t1\n (0..."
9,19,88,Массажная накидка Beurer mg,Распродаем остатки. Все модели хиты продаж!!! ...,9722988,"{""Вид товара"":""Приборы и аксессуары""}",5000.0,637640,500701.0,55.640538,37.606065,"(0, 10)\t0.5\n (0, 7)\t0.5\n (0, 27)\t0.5\...","(0, 31)\t0.20485558012199426\n (0, 35)\t0.2...","(0, 15)\t1\n (0, 59)\t1\n (0, 43)\t1\n (0..."


In [19]:
def make_value_list(df, id_list, column_name):
    '''Searches info table for matching list of itemID and column_name. Returns list.'''
    
    print('Searching for ItemID with Column:', column_name, '...')
    
    return [df.loc[df['itemID']==i, column_name].values[0] for i in list(id_list)]

def make_pair_feats(df, df_info):
    id_1 = df['itemID_1']
    id_2 = df['itemID_2']
    
    print('Making Integer features...')
    
    i_feats = ['price','locationID']
    for i in i_feats:
        id_1_feat = make_value_list(df_info,id_1,i)
        id_2_feat = make_value_list(df_info,id_2,i)
        dif_feat_list = [abs(x-y) for x, y in zip(id_1_feat,id_2_feat)]
        new_column_name = 'dif_'+i
        df[new_column_name]=dif_feat_list
    
    print('Making Vector features...')
    
    v_feats = ['tfidf_title','tfidf_desc','cv_category']
    for f in v_feats:
        id_1_feat = make_value_list(df_info,id_1,f)
        id_2_feat = make_value_list(df_info,id_2,f)
        cos_feat_list = [cosine_similarity(x,y) for x, y in zip(id_1_feat,id_2_feat)]
        new_column_name = 'cos_'+f
        df[new_column_name]=cos_feat_list
        
    print('make_pair_feats completed.')
        
make_pair_feats(demo_pair, demo_info)
demo_pair

Making Integer features...
Searching for ItemID with Column: price ...
Searching for ItemID with Column: price ...
Searching for ItemID with Column: locationID ...
Searching for ItemID with Column: locationID ...
Making Vector features...
Searching for ItemID with Column: tfidf_title ...
Searching for ItemID with Column: tfidf_title ...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Searching for ItemID with Column: tfidf_desc ...
Searching for ItemID with Column: tfidf_desc ...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Searching for ItemID with Column: cv_category ...
Searching for ItemID with Column: cv_category ...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
Calculating Cosine Similarity...
C

Unnamed: 0,itemID_1,itemID_2,isDuplicate,generationMethod,dif_price,dif_locationID,cos_tfidf_title,cos_tfidf_desc,cos_cv_category
0,1,3,1,1,0.0,9100,0.0,0.0,0.612372
1,4,8,1,1,3000.0,16290,0.0,0.114444,0.258199
2,3,7,0,1,286500.0,23170,0.0,0.0,0.353553
3,1,12,0,1,145000.0,17080,0.0,0.0,0.0
4,12,15,1,1,443400.0,31750,0.0,0.0,0.0


In [None]:
## Import Info Train df  ##
train_info = pd.read_csv(info_train_file, sep=',', usecols=['itemID','title','description','attrsJSON','price','locationID'], dtype={'itemID':'int32', 'title':'str', 'description':'str','attrsJSON':'str','price':'float32','locationID':'int32'})

train_info['title'].fillna('NaN', inplace=True)
train_info['description'].fillna('NaN', inplace=True)
train_info['attrsJSON'].fillna('NaN', inplace=True)
train_info['price'].fillna(train_info['price'].median(), inplace=True)
train_info['locationID'].fillna(train_info['locationID'].median(), inplace=True)

# Check type
print(train_info.dtypes)
train_info.head(15)
#train_info.tail(10)

In [None]:
### To get rough idea of size of entires ###

count = 0
with open(train_info, 'r', encoding='utf_8') as f:
    for line in f:
        count+=1
print('No. of Entries:', count)
import sys
print('Size:', round(sys.getsizeof(trainset) / 1e9, 1), 'GB')

In [None]:
## Import Pair Train df  ##
train_pair = pd.read_csv(pair_train_file, sep=',', dtype={'itemID':'int32'})
train_pair.head(2)

In [None]:
train_info.title=train_info.title.astype(str)
train_info.description=train_info.description.astype(str)

make_info_feats(train_info)
train_info.head(2)

In [None]:
make_pair_feats(train_pair)
train_pair.head(2)

In [None]:
### Visualizing Vectors in table form ###

features = v.get_feature_names()
def tfidf_table(Xtr=transformed_docs, features=features, doc=0, top_n=25, step=-1):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[doc].toarray())  # squeeze removes exta array in array [[]]
    
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::step][:top_n] #argsort returns index of ordered, step -1 is reverse
    
    #for i in topn_ids:
        #print((features[i], row[i]))
        
    top_feats = [(features[i], row[i]) for i in topn_ids if row[i]!=0]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df


#print(tfidf_table(doc=1, step=1, top_n=900))
print(tfidf_table(doc=1))