# Import libraries

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

## Read data

In [2]:
uns = pd.read_csv('uns_df.csv', index_col=0)
isri = pd.read_csv('isri_df.csv', index_col=0)
process = pd.read_csv('process_df.csv', index_col=0)
anamet = pd.read_csv('anamet_df.csv', index_col=0)
materials = pd.read_csv('materials_df.csv', index_col=0)

# New *NIKOS* data

In [3]:
xls = pd.ExcelFile('data/edited_nikos_data.xlsx')

In [4]:
uns_data = pd.read_excel(xls, 'UNS data')

uns_data.rename(columns = {'sc:isComposedOf':'elements id'}, inplace = True)

elements_data = pd.read_excel(xls, 'Elements data')

uns_data = pd.merge(uns_data, elements_data, how='inner', on = 'elements id')
isri_data = pd.read_excel(xls, 'Isri data')
anamet_data = pd.read_excel(xls, 'Anamet data')

Keep only `anamets` with english description

In [10]:
# Keep only anamets with english sc:descritpion
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

e_description_index = []
for row in range(len(anamet_data)):
    if only_roman_chars(str(anamet_data['sc:description'].iloc[row])):
        e_description_index.append(row)
        
anamet_data = anamet_data.iloc[e_description_index]

In [5]:
uns_data.head(2)

Unnamed: 0,uns id,sc:hasName,elements id,ar,atomicNumber,density,meltingPoint,hasName,symbol
0,sc:UNS_712,C83300,sc:Elements_329,207.2,82,11.29,327.5,LEAD,Pb
1,sc:UNS_733,C84500,sc:Elements_329,207.2,82,11.29,327.5,LEAD,Pb


In [6]:
isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257


In [9]:
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous,40014,
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous,40015,


In [17]:
print(uns_data.isna().sum(), '\n')
print(isri_data.isna().sum(), '\n')
print(anamet_data.isna().sum(), '\n')

uns id            0
sc:hasName      231
elements id       0
ar                0
atomicNumber      0
density           0
meltingPoint      0
hasName           0
symbol            0
dtype: int64 

isri id                  0
sc:hasCategory           0
sc:hasForm              43
sc:isFreeFrom           87
sc:contains             68
sc:hasISRICode           0
sc:hasSpecification     12
pc:hasName               7
sc:isComposedOf        109
sc:isProcessedBy       194
dtype: int64 

anamet entity         0
sc:description        0
sc:hasCategory        0
sc:internal_code      0
sc:isProcessedBy    209
dtype: int64 



In [22]:
isri_data

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
2,sc:ISRI_2,Nonferrous Scrap - Red Metals,"Miscellaneous, Wire","Excessively leaded, tinned, soldered copper wi...",Copper,Birch,Unalloyed,No. 2 Copper Wire,sc:Material_249,sc:Process_257
3,sc:ISRI_3,Nonferrous Scrap - Red Metals,"Clippings, punchings, bus bars, commutator seg...",,Copper,Candy,"clean, unalloyed, uncoated",No. 1 Heavy Copper Solids and Tubing,sc:Material_249,sc:Process_257
4,sc:ISRI_4,Nonferrous Scrap - Red Metals,Miscellaneous Scrap,"excessively leaded, tinned, soldered copper sc...",Copper,Cliff,Unalloyed,No. 2 Copper Solids and Tubing,sc:Material_249,sc:Process_257
...,...,...,...,...,...,...,...,...,...,...
239,sc:ISRI_239,Ferrous scrap - Steel from Scrap Tires,,,,278,Chopped. Less than two percent (<2%) rubber/fi...,Pulled Bead Wire (Ferrous) - Grade 1,,
240,sc:ISRI_240,Ferrous scrap - Steel from Scrap Tires,,,,279,Chopped. Less than five percent (<5%) rubber/f...,Pulled Bead Wire (Ferrous) - Grade 2,,
241,sc:ISRI_241,Ferrous scrap - Steel from Scrap Tires,,,,280,Chopped. Less than five percent (<5%) rubber/f...,Pulled Bead Wire (Ferrous) - Grade 3,,
242,sc:ISRI_242,Ferrous scrap - Steel from Scrap Tires,,,,281,Chopped. Ten to twenty percent (10-20%) rubber...,Pulled Bead Wire (Ferrous) - Grade 4,,


In [24]:
isri_data['sc:isProcessedBy'].value_counts()

sc:Process_257    50
Name: sc:isProcessedBy, dtype: int64

In [23]:
isri_for_ml = isri_data.drop(columns=['isri id', 'pc:hasName', 'sc:isProcessedBy'])

# NLP

In [5]:
from gensim.utils import simple_preprocess

Keep only `anamets` with english description

## ANAMET `anamet_data`

In [6]:
# Keep only anamets with english sc:descritpion
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

e_description_index = []
for row in range(len(anamet_data)):
    if only_roman_chars(str(anamet_data['sc:description'].iloc[row])):
        e_description_index.append(row)
        
anamet_data = anamet_data.iloc[e_description_index]
anamet_data

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous,40014,
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous,40015,
33,sc:ANAMET_467,MIXED COPPER CABLES SCRAP( RAW ),Non-Ferrous,40091,
34,sc:ANAMET_468,CABLES COPPER - LEAD(RAW),Non-Ferrous,40133,
37,sc:ANAMET_469,COPPER SCRAP BERRY/CANDY AS PER ISR(RAW),Non-Ferrous,40210,
...,...,...,...,...,...
317,sc:ANAMET_684,LEAD SCRAP (E),Non-Ferrous,50185,
318,sc:ANAMET_685,ELECTRIC MOTORS SHREDDED (WEEE),Non-Ferrous,50259,
319,sc:ANAMET_686,BLACK PLASTIC SCRAP (WEEE) ( E ),Non-Ferrous,50117,
320,sc:ANAMET_687,SCRAP PLEXIGLASS ( E ),Non-Ferrous,50118,


Preprocess text data using `gensim.utils.simple_preprocess()`

In [7]:
anamet_data['sc:description'] = anamet_data['sc:description'].apply(simple_preprocess)
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,"[copper, scrap, millberry, cables, for, strip]",Non-Ferrous,40014,
30,sc:ANAMET_466,"[copper, scrap, birch, cables, for, stripping]",Non-Ferrous,40015,


Delete word `scrap` and `for` from description

In [8]:
# Delete word scrap
for i in range(len(anamet_data)):
    if 'scrap' in anamet_data['sc:description'].iloc[i]:
        anamet_data['sc:description'].iloc[i].remove('scrap')
    if 'for' in anamet_data['sc:description'].iloc[i]:
        anamet_data['sc:description'].iloc[i].remove('for')

In [9]:
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,"[copper, millberry, cables, strip]",Non-Ferrous,40014,
30,sc:ANAMET_466,"[copper, birch, cables, stripping]",Non-Ferrous,40015,


## UNS `uns_data`

In [10]:
uns_data['hasName'] = uns_data['hasName'].apply(simple_preprocess)
uns_data.head(2)

Unnamed: 0,uns id,sc:hasName,elements id,ar,atomicNumber,density,meltingPoint,hasName,symbol
0,sc:UNS_712,C83300,sc:Elements_329,207.2,82,11.29,327.5,[lead],Pb
1,sc:UNS_733,C84500,sc:Elements_329,207.2,82,11.29,327.5,[lead],Pb


Θέλουμε να συσχετίσουμε τα `anamet` με τα `uns`. Οι μόνες πληροφορίες που έχουμε (features) είναι το `sc:description` και `hasName`. Θα εφαρμόσουμε NLP και ειδικότερα Word2Vec για ομοιότητες μεταξύ των texts.

In [11]:
print(anamet_data['anamet entity'].nunique(), "unique anamet ids")
print(uns_data['uns id'].nunique(), "unique uns ids")

214 unique anamet ids
912 unique uns ids


## ISRI `isri_data`

In [12]:
isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257


In [13]:
isri_data.drop(columns=['sc:isComposedOf', 'sc:isProcessedBy'], inplace=True)

In [14]:
isri_data.isna().sum()


isri id                 0
sc:hasCategory          0
sc:hasForm             43
sc:isFreeFrom          87
sc:contains            68
sc:hasISRICode          0
sc:hasSpecification    12
pc:hasName              7
dtype: int64

In [15]:
isri_data.fillna('nan', inplace=True)
isri_data.isna().sum()

isri id                0
sc:hasCategory         0
sc:hasForm             0
sc:isFreeFrom          0
sc:contains            0
sc:hasISRICode         0
sc:hasSpecification    0
pc:hasName             0
dtype: int64

In [30]:
isri_data['sc:hasCategory'] = isri_data['sc:hasCategory'].apply(simple_preprocess)
isri_data['sc:hasForm'] = isri_data['sc:hasForm'].apply(simple_preprocess)
isri_data['sc:isFreeFrom'] = isri_data['sc:isFreeFrom'].apply(simple_preprocess)
isri_data['sc:contains'] = isri_data['sc:contains'].apply(simple_preprocess)
isri_data['sc:hasISRICode'] = isri_data['sc:hasISRICode'].astype(str).apply(simple_preprocess)
isri_data['sc:hasSpecification'] = isri_data['sc:hasSpecification'].apply(simple_preprocess)
isri_data['pc:hasName'] = isri_data['pc:hasName'].apply(simple_preprocess)

TypeError: decoding to str: need a bytes-like object, list found

In [31]:
isri_data.head(3)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName
0,sc:ISRI_0,"[nonferrous, red, metals]",[wire],[nan],[copper],[berry],"[bare, uncoated, unalloyed]","[no, copper, wire]"
1,sc:ISRI_1,"[nonferrous, red, metals]","[wire, and, cable]","[brittle, burnt, wire, copper, tubing]",[copper],[barley],"[clean, untinned, uncoated, unalloyed]","[no, copper, wire]"
2,sc:ISRI_2,"[nonferrous, red, metals]","[miscellaneous, wire]","[excessively, leaded, tinned, soldered, copper...",[copper],[birch],[unalloyed],"[no, copper, wire]"


Delete word `scrap` and `for` from description

In [32]:
# Delete word scrap
for i in range(len(isri_data)):
    if 'scrap' in isri_data['sc:hasCategory'].iloc[i]:
        
        isri_data['sc:hasCategory'].iloc[i].remove('scrap')

[['copper', 'millberry', 'cables', 'strip'],
 ['copper', 'birch', 'cables', 'stripping'],
 ['mixed', 'copper', 'cables', 'raw'],
 ['cables', 'copper', 'lead', 'raw'],
 ['copper', 'berry', 'candy', 'as', 'per', 'isr', 'raw'],
 ['ms', 'raw'],
 ['brass', 'honey', 'as', 'per', 'isri', 'raw'],
 ['aluminum', 'taint', 'tabor', 'raw', 'material'],
 ['aluminum', 'mixed', 'raw', 'material'],
 ['aluminum', 'shred', 'raw', 'material'],
 ['aluminum', 'cables', 'stripping'],
 ['aluminum', 'cables', 'raw'],
 ['aluminum', 'talk', 'as', 'as', 'per', 'isri', 'raw'],
 ['cables', 'alouminium', 'lead', 'raw'],
 ['wire', 'acsr', 'raw'],
 ['cables', 'alouminium', 'copper', 'raw'],
 ['etalbond', 'raw'],
 ['plastic', 'raw', 'material'],
 ['car', 'bumpers', 'raw', 'material'],
 ['gas', 'tanks', 'raw', 'material'],
 ['depolluted', 'small', 'domestic', 'appl', 'weee'],
 ['starters', 'alternators', 'raw'],
 ['copper', 'millberry', 'as', 'per', 'isri'],
 ['copper', 'clove', 'as', 'per', 'isri'],
 ['copper', 'new', 

### Try Word2Vec stuff

In [54]:
import gensim
import gensim.downloader

# for model_name in list(gensim.downloader.info()['models'].keys()):
    # print(model_name)
    
model_pretrained = gensim.downloader.load('glove-twitter-25')
print(model_pretrained)

KeyedVectors<vector_size=25, 1193514 keys>


In [51]:
# NLP preprocess
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

description = anamet_data['sc:description']
description

model_anamet = Word2Vec(min_count=1, 
                 vector_size=25, 
                 workers=4, 
                 window=2, 
                 sg=1)

model_anamet.build_vocab(description)
model_anamet.train(description, total_examples=model_anamet.corpus_count,
           epochs=500)
print(model_anamet)

Word2Vec<vocab=195, vector_size=25, alpha=0.025>


#### Test example

In [50]:
test_anamet = anamet_data.iloc[120]
print(test_anamet)
print('\n')
print(test_anamet['sc:description'])

anamet entity                sc:ANAMET_587
sc:description      [mixed, plastic, weee]
sc:hasCategory                 Non-Ferrous
sc:internal_code                     50099
sc:isProcessedBy                       NaN
Name: 220, dtype: object


['mixed', 'plastic', 'weee']


In [68]:
uns_data

Unnamed: 0,uns id,sc:hasName,elements id,ar,atomicNumber,density,meltingPoint,hasName,symbol
0,sc:UNS_712,C83300,sc:Elements_329,207.200,82,11.29,327.5,[lead],Pb
1,sc:UNS_733,C84500,sc:Elements_329,207.200,82,11.29,327.5,[lead],Pb
2,sc:UNS_738,C85310,sc:Elements_329,207.200,82,11.29,327.5,[lead],Pb
3,sc:UNS_752,C85710,sc:Elements_329,207.200,82,11.29,327.5,[lead],Pb
4,sc:UNS_837,C91500,sc:Elements_329,207.200,82,11.29,327.5,[lead],Pb
...,...,...,...,...,...,...,...,...,...
907,sc:UNS_1657,C50200,sc:Elements_943,32.605,16,2.07,112.8,[sulfur],S
908,sc:UNS_1305,,sc:Elements_943,32.605,16,2.07,112.8,[sulfur],S
909,sc:UNS_1654,,sc:Elements_943,32.605,16,2.07,112.8,[sulfur],S
910,sc:UNS_1767,C70280,sc:Elements_946,10.811,5,2.34,2076.0,[boron],B


In [70]:
for row in range(len(uns_data)):
    w1 = uns_data['hasName'].iloc[row][0]
    w2 = 'copper'
    try:
        sim = model_anamet.wv.similarity(w1, w2)
    except KeyError:
        sim = model_pretrained.similarity(w1, w2)
    if sim >= .7: print(w1, w2, sim, uns_data['uns id'].iloc[row])

copper copper 1.0 sc:UNS_713
copper copper 1.0 sc:UNS_718
copper copper 1.0 sc:UNS_721
copper copper 1.0 sc:UNS_722
copper copper 1.0 sc:UNS_734
copper copper 1.0 sc:UNS_735
copper copper 1.0 sc:UNS_736
copper copper 1.0 sc:UNS_737
copper copper 1.0 sc:UNS_740
copper copper 1.0 sc:UNS_743
copper copper 1.0 sc:UNS_744
copper copper 1.0 sc:UNS_747
copper copper 1.0 sc:UNS_749
copper copper 1.0 sc:UNS_751
copper copper 1.0 sc:UNS_753
copper copper 1.0 sc:UNS_759
copper copper 1.0 sc:UNS_761
copper copper 1.0 sc:UNS_764
copper copper 1.0 sc:UNS_766
copper copper 1.0 sc:UNS_767
copper copper 1.0 sc:UNS_769
copper copper 1.0 sc:UNS_772
copper copper 1.0 sc:UNS_773
copper copper 1.0 sc:UNS_774
copper copper 1.0 sc:UNS_776
copper copper 1.0 sc:UNS_778
copper copper 1.0 sc:UNS_780
copper copper 1.0 sc:UNS_783
copper copper 1.0 sc:UNS_784
copper copper 1.0 sc:UNS_791
copper copper 1.0 sc:UNS_792
copper copper 1.0 sc:UNS_793
copper copper 1.0 sc:UNS_796
copper copper 1.0 sc:UNS_802
copper copper 

KeyError: "Key 'antimony' not present"

In [None]:
model_anamet.wv.similarity('tin', test_anamet['sc:description'][0])

In [None]:
model_pretrained.similarity('tin', test_anamet['sc:description'][0])

In [None]:
anamet_data['sc:description']

In [None]:
for row in range(200, 250):
    w1 = uns_data['hasName'].iloc[row][0]
    
    for j in range(3):
        try:
            w2 = anamet_data['sc:description'].iloc[j]
            print(w1, w2[0], model.similarity(w1, w2[0]))
        except KeyError:
            continue

# Clustering chemical elements

In [None]:
elements_data_df = elements_data[['ar', 'atomicNumber', 'density', 'meltingPoint', 'hasName']]
elements_data_df['hasName'] = elements_data_df['hasName'].apply(simple_preprocess)
elements_data_df.head(2)

In [None]:
print(elements_data.shape)
print(elements_data['hasName'].nunique())

Elements _included_ and _not included_ on uns_data

In [None]:
not_included = []
included = []
for row in range(len(uns_data)):
    try:
        model_anamet.wv.most_similar(uns_data['hasName'].iloc[row])
        included.append(uns_data['hasName'].iloc[row])
    except KeyError: 
        not_included.append(uns_data['hasName'].iloc[row])

included = pd.DataFrame(included)[0].unique()
not_included = pd.DataFrame(not_included)[0].unique()

print(included)
print(not_included)

In [None]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()

elements_data_df['hasName_id'] = label_enc.fit_transform(elements_data['hasName'])
elements_data_df.head(2)

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5)
kmeans.fit(elements_data_df.drop('hasName', axis=1))

In [None]:
yhat = kmeans.predict(elements_data_df.drop('hasName', axis=1))
elements_data_df['cluster'] = yhat
elements_data_df

In [None]:
for row in range(len(elements_data)):

    if elements_data_df['hasName'].iloc[row] in included:
        print(elements_data_df['hasName'].iloc[row], "->", elements_data_df['cluster'].iloc[row])

In [None]:
elements_dict = {}

for row in range(len(elements_data)):

    if elements_data_df['hasName'].iloc[row] in included:
        name = elements_data_df['hasName'].iloc[row][0]
        cluster = elements_data_df['cluster'].iloc[row]
        elements_dict[name] = cluster

elements_dict

similar_elements_dict = {}
for row in range(len(elements_data)):

    if elements_data_df['hasName'].iloc[row] in not_included:
        name = elements_data_df['hasName'].iloc[row][0]
        cluster = elements_data_df['cluster'].iloc[row]
        
        for key in elements_dict:

            if elements_dict[key] == cluster:
                similar_elements_dict[name] = key

In [None]:
similar_elements_dict