# Import libraries

In [34]:
import pandas as pd
import numpy as np

from tqdm import tqdm

## Read data

In [35]:
# Load the five pre-built tables from CSV files in the working directory.
# index_col=0 makes the first column of each file the DataFrame index.
# NOTE(review): none of these frames is referenced again in the visible part
# of the notebook — confirm they are still needed.
uns = pd.read_csv('uns_df.csv', index_col=0)
isri = pd.read_csv('isri_df.csv', index_col=0)
process = pd.read_csv('process_df.csv', index_col=0)
anamet = pd.read_csv('anamet_df.csv', index_col=0)
materials = pd.read_csv('materials_df.csv', index_col=0)

# New *NIKOS* data

In [36]:
# Open the Excel workbook once; the sheet reads in the following cells all
# go through this handle.
xls = pd.ExcelFile('data/edited_nikos_data.xlsx')

In [37]:
# UNS sheet: rename the composition column to 'elements id' while loading.
uns_data = (
    pd.read_excel(xls, 'UNS data')
    .rename(columns={'sc:isComposedOf': 'elements id'})
)

# Elements sheet, loaded unchanged.
elements_data = pd.read_excel(xls, 'Elements data')

In [38]:
# Inner join on 'elements id': only UNS rows with a matching element remain,
# and each one gains the element's columns.
uns_data = uns_data.merge(elements_data, on='elements id', how='inner')

In [39]:
# Load the 'Isri data' sheet from the already opened workbook.
isri_data = pd.read_excel(xls, 'Isri data')

In [40]:
# Load the 'Anamet data' sheet from the already opened workbook.
anamet_data = pd.read_excel(xls, 'Anamet data')

# NLP

In [41]:
from gensim.utils import simple_preprocess

Keep only the `anamet` rows that have an English description.

## ANAMET `anamet_data`

In [42]:
# Keep only anamets with an English sc:description
import unicodedata as ud

# Memo shared by is_latin(): maps a character to whether its Unicode name
# contains 'LATIN'.
latin_letters = {}


def is_latin(uchr):
    """Return True if the Unicode name of ``uchr`` contains 'LATIN'.

    Results are cached in the module-level ``latin_letters`` dict.
    Characters that have no Unicode name (e.g. control characters) are
    reported as non-Latin instead of raising ``ValueError``.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # The '' default covers unnamed code points, for which ud.name()
        # would otherwise raise.
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))


def only_roman_chars(unistr):
    """Return True if every alphabetic character in ``unistr`` is Latin.

    Non-alphabetic characters (digits, punctuation, whitespace) are ignored,
    so a string without any letters also returns True.
    """
    return all(is_latin(uchr) for uchr in unistr if uchr.isalpha())

# Positions of the rows whose description (coerced to str) contains only
# Latin letters.
e_description_index = [
    row
    for row, text in enumerate(anamet_data['sc:description'])
    if only_roman_chars(str(text))
]

# .copy() detaches the filtered frame from the original, so the column
# assignments in later cells do not write into a slice
# (SettingWithCopyWarning).
anamet_data = anamet_data.iloc[e_description_index].copy()
anamet_data

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous,40014,
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous,40015,
33,sc:ANAMET_467,MIXED COPPER CABLES SCRAP( RAW ),Non-Ferrous,40091,
34,sc:ANAMET_468,CABLES COPPER - LEAD(RAW),Non-Ferrous,40133,
37,sc:ANAMET_469,COPPER SCRAP BERRY/CANDY AS PER ISR(RAW),Non-Ferrous,40210,
...,...,...,...,...,...
317,sc:ANAMET_684,LEAD SCRAP (E),Non-Ferrous,50185,
318,sc:ANAMET_685,ELECTRIC MOTORS SHREDDED (WEEE),Non-Ferrous,50259,
319,sc:ANAMET_686,BLACK PLASTIC SCRAP (WEEE) ( E ),Non-Ferrous,50117,
320,sc:ANAMET_687,SCRAP PLEXIGLASS ( E ),Non-Ferrous,50118,


Preprocess text data using `gensim.utils.simple_preprocess()`

In [43]:
# Replace each description string with the token list that
# simple_preprocess produces for it.
tokenized_descriptions = anamet_data['sc:description'].map(simple_preprocess)
anamet_data['sc:description'] = tokenized_descriptions
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,"[copper, scrap, millberry, cables, for, strip]",Non-Ferrous,40014,
30,sc:ANAMET_466,"[copper, scrap, birch, cables, for, stripping]",Non-Ferrous,40015,


Delete word `scrap` from description

In [44]:
# Delete the word 'scrap' from every description.
# A new list is built per row instead of calling list.remove() on the stored
# object: this removes every occurrence (remove() drops only the first) and
# assigns the column explicitly rather than mutating it through chained
# indexing.
anamet_data['sc:description'] = anamet_data['sc:description'].map(
    lambda tokens: [token for token in tokens if token != 'scrap']
)

In [45]:
# Show the first two rows to check the result of the 'scrap' removal.
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,"[copper, millberry, cables, for, strip]",Non-Ferrous,40014,
30,sc:ANAMET_466,"[copper, birch, cables, for, stripping]",Non-Ferrous,40015,


## UNS `uns_data`

In [46]:
# Replace each element name with the token list that simple_preprocess
# produces for it.
tokenized_names = uns_data['hasName'].map(simple_preprocess)
uns_data['hasName'] = tokenized_names
uns_data.head(2)

Unnamed: 0,uns id,sc:hasName,elements id,ar,atomicNumber,density,meltingPoint,hasName,symbol
0,sc:UNS_712,C83300,sc:Elements_329,207.2,82,11.29,327.5,[lead],Pb
1,sc:UNS_733,C84500,sc:Elements_329,207.2,82,11.29,327.5,[lead],Pb


Θέλουμε να συσχετίσουμε τα `anamet` με τα `uns`. Οι μόνες πληροφορίες που έχουμε (features) είναι το `sc:description` και `hasName`. Θα εφαρμόσουμε NLP και ειδικότερα Word2Vec για ομοιότητες μεταξύ των texts.

In [47]:
# Report how many distinct ids exist on each side of the intended matching.
n_anamet_ids = anamet_data['anamet entity'].nunique()
print(f"{n_anamet_ids} unique anamet ids")
n_uns_ids = uns_data['uns id'].nunique()
print(f"{n_uns_ids} unique uns ids")

214 unique anamet ids
912 unique uns ids


### Try Word2Vec stuff

In [48]:
# Pick one row (position 120) as a sample record for the experiments below.
# NOTE(review): 120 looks like an arbitrary choice — confirm.
test_anamet = anamet_data.iloc[120]
test_anamet

anamet entity                sc:ANAMET_587
sc:description      [mixed, plastic, weee]
sc:hasCategory                 Non-Ferrous
sc:internal_code                     50099
sc:isProcessedBy                       NaN
Name: 220, dtype: object

In [51]:
import gensim
import gensim.downloader

# Print the name of every pretrained model the gensim downloader offers.
available_models = gensim.downloader.info()['models']
for model_name in available_models:
    print(model_name)

# Load the 'glove-twitter-25' pretrained vectors.
model = gensim.downloader.load('glove-twitter-25')

fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


In [52]:
# Sanity check: similarity score the loaded model reports for two words.
model.similarity('plastic', 'zinc')

0.59558815

In [None]:
# Display the tokenised anamet descriptions.
anamet_data['sc:description']

In [None]:
# Probe: for UNS rows 200-249, print the similarity between the first token
# of the element name and the first token of each of the first three anamet
# descriptions. Pairs that raise KeyError are skipped.
uns_names = uns_data['hasName']
for uns_pos in range(200, 250):
    uns_word = uns_names.iloc[uns_pos][0]

    for anamet_pos in range(3):
        try:
            anamet_word = anamet_data['sc:description'].iloc[anamet_pos][0]
            print(uns_word, anamet_word, model.similarity(uns_word, anamet_word))
        except KeyError:
            pass

In [None]:
# NLP preprocess
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# Training corpus: one token list per anamet description.
description = anamet_data['sc:description']

# Skip-gram (sg=1) Word2Vec trained from scratch on the descriptions.
# seed plus workers=1 make repeated runs comparable; gensim documents that a
# fully deterministic run additionally needs a fixed PYTHONHASHSEED.
# NOTE(review): this rebinds `model`, replacing whatever an earlier cell
# stored under that name.
model = Word2Vec(min_count=1,
                 vector_size=25,
                 workers=1,
                 window=2,
                 sg=1,
                 seed=42)

model.build_vocab(description)
model.train(description, total_examples=model.corpus_count,
            epochs=500)

In [None]:
# Nearest neighbours of 'lead' in the model's word vectors.
model.wv.most_similar('lead')

In [None]:
# First token of the second anamet description.
anamet_data['sc:description'].iloc[1][0]

In [None]:
# similarity() compares two single keys, but a description is a list of
# tokens. n_similarity() compares two token lists, so the single word is
# wrapped in a list.
model.wv.n_similarity(anamet_data['sc:description'].iloc[0], ['copper'])

In [None]:
import gensim
import gensim.downloader

# Print the name of every pretrained model the gensim downloader offers.
available_models = gensim.downloader.info()['models']
for model_name in available_models:
    print(model_name)

# NOTE(review): the variable is called google_news_vectors, but the model
# requested here is 'glove-wiki-gigaword-300' — confirm which one is intended.
google_news_vectors = gensim.downloader.load('glove-wiki-gigaword-300')

In [None]:
# Persist the downloaded vectors to disk.
# NOTE(review): the file name says word2vec, while the visible loading cell
# requests 'glove-wiki-gigaword-300' — confirm the naming.
google_news_vectors.save("pretrained_word2vec.model")

In [None]:
# Display the repr of the loaded vectors object.
google_news_vectors

In [None]:
# Split the UNS element names by whether every one of their tokens is in the
# model's vocabulary. Membership is tested directly rather than by running a
# nearest-neighbour search per row and catching KeyError.
not_included = []
included = []
for name_tokens in uns_data['hasName']:
    if all(token in model.wv for token in name_tokens):
        included.append(name_tokens)
    else:
        not_included.append(name_tokens)

# Distinct first tokens of the names that are missing from the vocabulary.
pd.DataFrame(not_included)[0].unique()

In [None]:
# Distinct first tokens of the names collected in `included`.
pd.DataFrame(included)[0].unique()

In [None]:
# After tokenisation the 'hasName' cells are lists, which are unhashable and
# cannot be counted directly; count them as tuples instead.
uns_data['hasName'].map(tuple).value_counts()

In [None]:
# Work on an explicit copy of the selected columns so the assignments below
# (and in later cells) do not write into a slice of elements_data
# (SettingWithCopyWarning).
elements_data_df = elements_data[['ar', 'atomicNumber', 'density', 'meltingPoint', 'hasName']].copy()
elements_data_df['hasName'] = elements_data_df['hasName'].apply(simple_preprocess)
elements_data_df.head(2)

In [None]:
# Number of distinct element names in the raw elements table.
print(elements_data['hasName'].nunique())

In [None]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()

# Fit the encoder on the raw element names, then store the integer code of
# each name next to the numeric properties.
raw_names = elements_data['hasName']
label_enc.fit(raw_names)
elements_data_df['hasName_id'] = label_enc.transform(raw_names)
elements_data_df.head(2)

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

# A fixed random_state keeps the centroid initialisation, and therefore the
# cluster labels, the same on every run.
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit on every column except the tokenised name. A 'cluster' column left by a
# previous run of the prediction cell is excluded too, so re-running this
# cell does not feed old labels back in as a feature.
kmeans.fit(elements_data_df.drop(columns=['hasName', 'cluster'], errors='ignore'))

In [None]:
# Distinct first tokens of the names collected in `included`.
pd.DataFrame(included)[0].unique()

In [None]:
# Distinct first tokens of the names collected in `not_included`.
pd.DataFrame(not_included)[0].unique()

In [None]:
# Predict on every column except the tokenised name. 'cluster' is dropped as
# well when present, so the cell can be re-run after it has added that
# column without changing the feature set.
cluster_features = elements_data_df.drop(columns=['hasName', 'cluster'], errors='ignore')
yhat = kmeans.predict(cluster_features)
elements_data_df['cluster'] = yhat
elements_data_df

In [None]:
# For every in-vocabulary element name, print the cluster of each elements
# row whose first name token matches it.
for elm in pd.DataFrame(included)[0].unique():
    name_cluster_pairs = zip(elements_data_df['hasName'], elements_data_df['cluster'])
    for name_tokens, cluster_id in name_cluster_pairs:
        if name_tokens[0] == elm:
            print(elm, cluster_id)

In [None]:
# For every out-of-vocabulary element name, print the cluster of each
# elements row whose first name token matches it.
for elm in pd.DataFrame(not_included)[0].unique():
    name_cluster_pairs = zip(elements_data_df['hasName'], elements_data_df['cluster'])
    for name_tokens, cluster_id in name_cluster_pairs:
        if name_tokens[0] == elm:
            print(elm, cluster_id)

In [None]:
# Disabled experiment: replace out-of-vocabulary element names with
# in-vocabulary substitutes.
# for row in range(len(uns_data)):
#     if uns_data['hasName'].iloc[row][0] in pd.DataFrame(not_included)[0].unique():
#         if uns_data['hasName'].iloc[row][0] == 'tin':
#             uns_data['hasName'].iloc[row][0] = 'lead'
#         if uns_data['hasName'].iloc[row][0] == 'manganese':
#             uns_data['hasName'].iloc[row][0] = 'nickel'
#         if uns_data['hasName'].iloc[row][0] == 'tin':
#             uns_data['hasName'].iloc[row][0] = 'lead'



In [None]:
# After tokenisation the 'hasName' cells are lists, which are unhashable and
# cannot be counted directly; count them as tuples instead.
uns_data['hasName'].map(tuple).value_counts()

In [None]:
# Tokenised element name of the first UNS row.
uns_data['hasName'].iloc[0]

In [None]:
# Print every UNS element name that has at least one token missing from the
# model's vocabulary. Membership is tested directly rather than by running a
# nearest-neighbour search per row and catching KeyError.
for name_tokens in uns_data['hasName']:
    if not all(token in model.wv for token in name_tokens):
        print(name_tokens)