# Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  

from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

## Read data

In [3]:
uns = pd.read_csv('uns_df.csv', index_col=0)
isri = pd.read_csv('isri_df.csv', index_col=0)
process = pd.read_csv('process_df.csv', index_col=0)
anamet = pd.read_csv('anamet_df.csv', index_col=0)
materials = pd.read_csv('materials_df.csv', index_col=0)

# New *NIKOS* data

In [4]:
xls = pd.ExcelFile('data/edited_nikos_data.xlsx')
xls.sheet_names

['ANAMET',
 'Anamet data',
 'ISRI',
 'Isri data',
 'UNS data',
 'Elements data',
 'Material data',
 'Process']

In [5]:
uns_data = pd.read_excel(xls, 'UNS data')

uns_data.rename(columns = {'sc:isComposedOf':'elements id'}, inplace = True)

elements_data = pd.read_excel(xls, 'Elements data')

uns_data = pd.merge(uns_data, elements_data, how='inner', on = 'elements id')
isri_data = pd.read_excel(xls, 'Isri data')
anamet_data = pd.read_excel(xls, 'Anamet data')
material_data = pd.read_excel(xls, 'Material data')
elements_data = pd.read_excel(xls, 'Elements data')

# 1. Predict missing feature values

In [6]:
isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257


In [7]:
isri_data.shape

(244, 10)

Replace `sc:isComposedOf` with the matching material from `material_name`

In [8]:
# # Replace `sc:isComposedOf` with the matching material from `material_data`
# dict_materials = {}

# for row in range(len(material_data)):
#     name = material_data['Material ID'].iloc[row]
#     value = material_data['sc:hasName'].iloc[row]

#     dict_materials[name] = value

# isri_data.replace({"sc:isComposedOf": dict_materials},inplace=True)
# isri_data.head()

In [9]:
# Replace `sc:isComposedOf` with the matching material from `material_data`
dict_elements = {}

for row in range(len(elements_data)):
    name = elements_data['elements id'].iloc[row]
    value = elements_data['hasName'].iloc[row]

    dict_elements[name] = value

material_data.replace({"sc:contains": dict_elements},inplace=True)
material_data

Unnamed: 0,Material ID,sc:hasName,sc:contains
0,sc:Material_244,ALUMINUM,
1,sc:Material_245,MALLEABLE IRON,
2,sc:Material_246,ALUMINUM BRASS,
3,sc:Material_247,BRASS,
4,sc:Material_248,BRONZE,
...,...,...,...
244,sc:Material_283,,ALUMINUM
245,sc:Material_283,,COPPER
246,sc:Material_283,,LEAD
247,sc:Material_284,,COPPER


In [10]:
isri_data.shape

(244, 10)

In [11]:
isri_data.head()

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
2,sc:ISRI_2,Nonferrous Scrap - Red Metals,"Miscellaneous, Wire","Excessively leaded, tinned, soldered copper wi...",Copper,Birch,Unalloyed,No. 2 Copper Wire,sc:Material_249,sc:Process_257
3,sc:ISRI_3,Nonferrous Scrap - Red Metals,"Clippings, punchings, bus bars, commutator seg...",,Copper,Candy,"clean, unalloyed, uncoated",No. 1 Heavy Copper Solids and Tubing,sc:Material_249,sc:Process_257
4,sc:ISRI_4,Nonferrous Scrap - Red Metals,Miscellaneous Scrap,"excessively leaded, tinned, soldered copper sc...",Copper,Cliff,Unalloyed,No. 2 Copper Solids and Tubing,sc:Material_249,sc:Process_257


How many __unique__ ISRI?

In [12]:
isri_data['isri id'].nunique()

244

In [13]:
# check nans
isri_data.isna().sum()

isri id                  0
sc:hasCategory           0
sc:hasForm              43
sc:isFreeFrom           87
sc:contains             68
sc:hasISRICode           0
sc:hasSpecification     12
pc:hasName               7
sc:isComposedOf        109
sc:isProcessedBy       194
dtype: int64

Όσα `ISRI` έχουν available `process`, ανήκουν στο ίδιο. Επομένως, δεν έχει νοήμα η προσπάθεια πρόβλεψης του process κάποιου scrap.

In [14]:
print(isri_data['sc:isProcessedBy'].unique())

# drop process from ISRI
isri_data.drop('sc:isProcessedBy', axis=1, inplace=True)

['sc:Process_257' nan]


In [15]:
isri_data['sc:hasCategory'].value_counts()

Nonferrous Scrap - Red Metals                                                                       50
Nonferrous Scrap - Aluminum                                                                         48
Ferrous Scrap                                                                                       30
Ferrous Scrap - Electric Furnace Casting and Foundry Grades                                         25
Nonferrous Scrap - Nickel/Stainless/Hi Temp                                                         19
Ferrous Scrap - Specially processed Grades to Meet Consumer Requirements - Cast Iron Grades         15
Nonferrous Scrap - Zinc                                                                             13
Ferrous scrap - Steel from Scrap Tires                                                              11
Nonferrous Scrap - Lead                                                                             10
Nonferrous Scrap - Mixed Metals                                          

In [16]:
isri_data

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249
2,sc:ISRI_2,Nonferrous Scrap - Red Metals,"Miscellaneous, Wire","Excessively leaded, tinned, soldered copper wi...",Copper,Birch,Unalloyed,No. 2 Copper Wire,sc:Material_249
3,sc:ISRI_3,Nonferrous Scrap - Red Metals,"Clippings, punchings, bus bars, commutator seg...",,Copper,Candy,"clean, unalloyed, uncoated",No. 1 Heavy Copper Solids and Tubing,sc:Material_249
4,sc:ISRI_4,Nonferrous Scrap - Red Metals,Miscellaneous Scrap,"excessively leaded, tinned, soldered copper sc...",Copper,Cliff,Unalloyed,No. 2 Copper Solids and Tubing,sc:Material_249
...,...,...,...,...,...,...,...,...,...
239,sc:ISRI_239,Ferrous scrap - Steel from Scrap Tires,,,,278,Chopped. Less than two percent (<2%) rubber/fi...,Pulled Bead Wire (Ferrous) - Grade 1,
240,sc:ISRI_240,Ferrous scrap - Steel from Scrap Tires,,,,279,Chopped. Less than five percent (<5%) rubber/f...,Pulled Bead Wire (Ferrous) - Grade 2,
241,sc:ISRI_241,Ferrous scrap - Steel from Scrap Tires,,,,280,Chopped. Less than five percent (<5%) rubber/f...,Pulled Bead Wire (Ferrous) - Grade 3,
242,sc:ISRI_242,Ferrous scrap - Steel from Scrap Tires,,,,281,Chopped. Ten to twenty percent (10-20%) rubber...,Pulled Bead Wire (Ferrous) - Grade 4,


# Full merged `ISRI` dataset

In [17]:
isri_data.head()

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249
2,sc:ISRI_2,Nonferrous Scrap - Red Metals,"Miscellaneous, Wire","Excessively leaded, tinned, soldered copper wi...",Copper,Birch,Unalloyed,No. 2 Copper Wire,sc:Material_249
3,sc:ISRI_3,Nonferrous Scrap - Red Metals,"Clippings, punchings, bus bars, commutator seg...",,Copper,Candy,"clean, unalloyed, uncoated",No. 1 Heavy Copper Solids and Tubing,sc:Material_249
4,sc:ISRI_4,Nonferrous Scrap - Red Metals,Miscellaneous Scrap,"excessively leaded, tinned, soldered copper sc...",Copper,Cliff,Unalloyed,No. 2 Copper Solids and Tubing,sc:Material_249


In [18]:
isri_data.shape

(244, 9)

In [19]:
isri_data.isna().sum()

isri id                  0
sc:hasCategory           0
sc:hasForm              43
sc:isFreeFrom           87
sc:contains             68
sc:hasISRICode           0
sc:hasSpecification     12
pc:hasName               7
sc:isComposedOf        109
dtype: int64

In [20]:
isri_data.fillna('zxcv', inplace=True)

In [21]:
isri_data.isna().sum()

isri id                0
sc:hasCategory         0
sc:hasForm             0
sc:isFreeFrom          0
sc:contains            0
sc:hasISRICode         0
sc:hasSpecification    0
pc:hasName             0
sc:isComposedOf        0
dtype: int64

## Cluster `isri_data['sc:hasCategory']` 

In [22]:
# Probably just a 'code name'. We will drop it
isri_data.drop('sc:hasISRICode', axis=1, inplace=True)

In [23]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# import other required libs
import pandas as pd
import numpy as np

# string manipulation libs
import re
import string
import nltk
from nltk.corpus import stopwords

In [24]:
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special chars
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # 1. creates tokens
        tokens = nltk.word_tokenize(text)
        # 2. checks if token is a stopword and removes it
        tokens = [w for w in tokens if not w.lower() in stopwords.words("english")]
        # 3. joins all tokens again
        text = " ".join(tokens)
    # returns cleaned text
    text = text.lower().strip()
    return text

In [25]:
isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasSpecification,pc:hasName,sc:isComposedOf
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,zxcv,Copper,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249


In [26]:
pred_isComposedOf_df = isri_data[isri_data['sc:isComposedOf'] != 'zxcv']
pred_isComposedOf_df.drop(columns=['isri id', 'sc:isFreeFrom'], inplace=True)
pred_isComposedOf_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_isComposedOf_df.drop(columns=['isri id', 'sc:isFreeFrom'], inplace=True)


Unnamed: 0,sc:hasCategory,sc:hasForm,sc:contains,sc:hasSpecification,pc:hasName,sc:isComposedOf
0,Nonferrous Scrap - Red Metals,Wire,Copper,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249
1,Nonferrous Scrap - Red Metals,Wire and Cable,Copper,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249
2,Nonferrous Scrap - Red Metals,"Miscellaneous, Wire",Copper,Unalloyed,No. 2 Copper Wire,sc:Material_249
3,Nonferrous Scrap - Red Metals,"Clippings, punchings, bus bars, commutator seg...",Copper,"clean, unalloyed, uncoated",No. 1 Heavy Copper Solids and Tubing,sc:Material_249
4,Nonferrous Scrap - Red Metals,Miscellaneous Scrap,Copper,Unalloyed,No. 2 Copper Solids and Tubing,sc:Material_249


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# columns = ['sc:hasCategory', 'pc:hasName']
columns = pred_isComposedOf_df.columns

for col_name in columns:
    
    if col_name == 'sc:isComposedOf': continue
    else:
            
        text = isri_data[col_name]

        df = pd.DataFrame()
        df['corpus'] = text
        df['cleaned'] = df['corpus'].apply(lambda x: preprocess_text(x, remove_stopwords=True))

        pred_isComposedOf_df[col_name] = df['cleaned']

        del df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_isComposedOf_df[col_name] = df['cleaned']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_isComposedOf_df[col_name] = df['cleaned']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_isComposedOf_df[col_name] = df['cleaned']
A value is trying to be set on a copy of a slice from a DataFram

In [28]:
pred_isComposedOf_df

Unnamed: 0,sc:hasCategory,sc:hasForm,sc:contains,sc:hasSpecification,pc:hasName,sc:isComposedOf
0,nonferrous scrap red metals,wire,copper,bare uncoated unalloyed,copper wire,sc:Material_249
1,nonferrous scrap red metals,wire cable,copper,clean untinned uncoated unalloyed,copper wire,sc:Material_249
2,nonferrous scrap red metals,miscellaneous wire,copper,unalloyed,copper wire,sc:Material_249
3,nonferrous scrap red metals,clippings punchings bus bars commutator segmen...,copper,clean unalloyed uncoated,heavy copper solids tubing,sc:Material_249
4,nonferrous scrap red metals,miscellaneous scrap,copper,unalloyed,copper solids tubing,sc:Material_249
...,...,...,...,...,...,...
228,ferrous scrap specially processed grades meet ...,borings drillings,cast malleable iron,containing percent oil,chemical borings,sc:Material_245
229,ferrous scrap specially processed grades meet ...,borings,cast iron,heated briquetted density approximately percen...,briquetted cast iron borings hot process,sc:Material_697
230,ferrous scrap specially processed grades meet ...,boring briquettes,cast iron,hydraulically compressed cohesive solid densit...,briquetted cast iron borings cold process,sc:Material_697
231,ferrous scrap specially processed grades meet ...,borings drillings,malleable iron,clean,malleable borings,sc:Material_245


In [29]:
def cluster_text(text, true_k):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(text)

    return X

   #  Sum_of_squared_distances = []
   #  K = range(2,10)
   #  for k in K:
   #     km = KMeans(n_clusters=k, max_iter=200, n_init=10)
   #     km = km.fit(X)
   #     Sum_of_squared_distances.append(km.inertia_)
   #  plt.plot(K, Sum_of_squared_distances, 'bx-')
   #  plt.xlabel('k')
   #  plt.ylabel('Sum_of_squared_distances')
   #  plt.title('Elbow Method For Optimal k')
   #  plt.show()

   # #  true_k = 6
   #  model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)
   #  model.fit(X)

   #  labels=model.labels_
   #  clusters=pd.DataFrame(list(zip(text,labels)),columns=['title','cluster'])
   #  #print(clusters.sort_values(by=['cluster']))
       
   #  return clusters

In [30]:
X = cluster_text(pred_isComposedOf_df['sc:hasCategory'], 6)
X.shape

(135, 22)

In [32]:
for mat in pred_isComposedOf_df['sc:isComposedOf'].value_counts().tail(15).index.values:

    pred_isComposedOf_df = pred_isComposedOf_df[pred_isComposedOf_df['sc:isComposedOf'] != mat]

In [61]:
from sklearn.model_selection import train_test_split

X = pred_isComposedOf_df.drop('sc:isComposedOf', axis=1)
y = pred_isComposedOf_df['sc:isComposedOf']

X = cluster_text(X['sc:hasCategory'], 6)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

<113x22 sparse matrix of type '<class 'numpy.float64'>'
	with 481 stored elements in Compressed Sparse Row format>

In [67]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [68]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)
y_hat = model.predict(X_test)


In [69]:
results = pd.DataFrame()

results['true'] = y_test
results['pred'] = y_hat

results

Unnamed: 0,true,pred
0,4,3
1,7,7
2,11,11
3,3,3
4,4,3
5,9,9
6,9,10
7,6,3
8,4,3
9,7,7


In [70]:
accuracy_score(y_test, y_hat)

0.5217391304347826

In [None]:
isri_data['cluster_category'] = cluster_text(isri_data['sc:hasCategory'], 6)['cluster']
isri_data.head(2)

In [None]:
isri_data['cluster_hasName'] = cluster_text(isri_data['pc:hasName'], 5)['cluster']
isri_data.head(2)

In [None]:
 isri_data[isri_data['cluster_hasName'] == 1]

Keep only `anamets` with english description

In [None]:
# Keep only anamets with english sc:descritpion
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

e_description_index = []
for row in range(len(anamet_data)):
    if only_roman_chars(str(anamet_data['sc:description'].iloc[row])):
        e_description_index.append(row)
        
anamet_data = anamet_data.iloc[e_description_index]

In [None]:
uns_data.head(2)

In [None]:
isri_data.head(2)

In [None]:
anamet_data.head(2)

In [None]:
print(uns_data.isna().sum(), '\n')
print(isri_data.isna().sum(), '\n')
print(anamet_data.isna().sum(), '\n')

In [None]:
anamet_data['sc:hasCategory'].value_counts()

In [None]:
isri_data

In [None]:
isri_data['sc:isProcessedBy'].value_counts()

In [None]:
isri_for_ml = isri_data.drop(columns=['isri id', 'pc:hasName', 'sc:isProcessedBy'])

# NLP

In [None]:
from gensim.utils import simple_preprocess

Keep only `anamets` with english description

## ANAMET `anamet_data`

In [None]:
# Keep only anamets with english sc:descritpion
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

e_description_index = []
for row in range(len(anamet_data)):
    if only_roman_chars(str(anamet_data['sc:description'].iloc[row])):
        e_description_index.append(row)
        
anamet_data = anamet_data.iloc[e_description_index]
anamet_data

Preprocess text data using `gensim.utils.simple_preprocess()`

In [None]:
anamet_data['sc:description'] = anamet_data['sc:description'].apply(simple_preprocess)
anamet_data.head(2)

Delete word `scrap` and `for` from description

In [None]:
# Delete word scrap
for i in range(len(anamet_data)):
    if 'scrap' in anamet_data['sc:description'].iloc[i]:
        anamet_data['sc:description'].iloc[i].remove('scrap')
    if 'for' in anamet_data['sc:description'].iloc[i]:
        anamet_data['sc:description'].iloc[i].remove('for')

In [None]:
anamet_data.head(2)

## UNS `uns_data`

In [None]:
uns_data['hasName'] = uns_data['hasName'].apply(simple_preprocess)
uns_data.head(2)

Θέλουμε να συσχετίσουμε τα `anamet` με τα `uns`. Οι μόνες πληροφορίες που έχουμε (features) είναι το `sc:description` και `hasName`. Θα εφαρμόσουμε NLP και ειδικότερα Word2Vec για ομοιότητες μεταξύ των texts.

In [None]:
print(anamet_data['anamet entity'].nunique(), "unique anamet ids")
print(uns_data['uns id'].nunique(), "unique uns ids")

## ISRI `isri_data`

In [None]:
isri_data.head(2)

In [None]:
isri_data.drop(columns=['sc:isComposedOf', 'sc:isProcessedBy'], inplace=True)

In [None]:
isri_data.isna().sum()


In [None]:
isri_data.fillna('nan', inplace=True)
isri_data.isna().sum()

In [None]:
isri_data['sc:hasCategory'] = isri_data['sc:hasCategory'].apply(simple_preprocess)
isri_data['sc:hasForm'] = isri_data['sc:hasForm'].apply(simple_preprocess)
isri_data['sc:isFreeFrom'] = isri_data['sc:isFreeFrom'].apply(simple_preprocess)
isri_data['sc:contains'] = isri_data['sc:contains'].apply(simple_preprocess)
isri_data['sc:hasISRICode'] = isri_data['sc:hasISRICode'].astype(str).apply(simple_preprocess)
isri_data['sc:hasSpecification'] = isri_data['sc:hasSpecification'].apply(simple_preprocess)
isri_data['pc:hasName'] = isri_data['pc:hasName'].apply(simple_preprocess)

In [None]:
isri_data.head(3)

Delete word `scrap` and `for` from description

In [None]:
# Delete word scrap
for i in range(len(isri_data)):
    if 'scrap' in isri_data['sc:hasCategory'].iloc[i]:
        
        isri_data['sc:hasCategory'].iloc[i].remove('scrap')

### Try Word2Vec stuff

In [None]:
import gensim
import gensim.downloader

# for model_name in list(gensim.downloader.info()['models'].keys()):
    # print(model_name)
    
model_pretrained = gensim.downloader.load('glove-twitter-25')
print(model_pretrained)

In [None]:
# NLP preprocess
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

description = anamet_data['sc:description']
description

model_anamet = Word2Vec(min_count=1, 
                 vector_size=25, 
                 workers=4, 
                 window=2, 
                 sg=1)

model_anamet.build_vocab(description)
model_anamet.train(description, total_examples=model_anamet.corpus_count,
           epochs=500)
print(model_anamet)

#### Test example

In [None]:
test_anamet = anamet_data.iloc[120]
print(test_anamet)
print('\n')
print(test_anamet['sc:description'])

In [None]:
uns_data

In [None]:
for row in range(len(uns_data)):
    w1 = uns_data['hasName'].iloc[row][0]
    w2 = 'copper'
    try:
        sim = model_anamet.wv.similarity(w1, w2)
    except KeyError:
        sim = model_pretrained.similarity(w1, w2)
    if sim >= .7: print(w1, w2, sim, uns_data['uns id'].iloc[row])

In [None]:
model_anamet.wv.similarity('tin', test_anamet['sc:description'][0])

In [None]:
model_pretrained.similarity('tin', test_anamet['sc:description'][0])

In [None]:
anamet_data['sc:description']

In [None]:
for row in range(200, 250):
    w1 = uns_data['hasName'].iloc[row][0]
    
    for j in range(3):
        try:
            w2 = anamet_data['sc:description'].iloc[j]
            print(w1, w2[0], model.similarity(w1, w2[0]))
        except KeyError:
            continue

# Clustering chemical elements

In [None]:
elements_data_df = elements_data[['ar', 'atomicNumber', 'density', 'meltingPoint', 'hasName']]
elements_data_df['hasName'] = elements_data_df['hasName'].apply(simple_preprocess)
elements_data_df.head(2)

In [None]:
print(elements_data.shape)
print(elements_data['hasName'].nunique())

Elements _included_ and _not included_ on uns_data

In [None]:
not_included = []
included = []
for row in range(len(uns_data)):
    try:
        model_anamet.wv.most_similar(uns_data['hasName'].iloc[row])
        included.append(uns_data['hasName'].iloc[row])
    except KeyError: 
        not_included.append(uns_data['hasName'].iloc[row])

included = pd.DataFrame(included)[0].unique()
not_included = pd.DataFrame(not_included)[0].unique()

print(included)
print(not_included)

In [None]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()

elements_data_df['hasName_id'] = label_enc.fit_transform(elements_data['hasName'])
elements_data_df.head(2)

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5)
kmeans.fit(elements_data_df.drop('hasName', axis=1))

In [None]:
yhat = kmeans.predict(elements_data_df.drop('hasName', axis=1))
elements_data_df['cluster'] = yhat
elements_data_df

In [None]:
for row in range(len(elements_data)):

    if elements_data_df['hasName'].iloc[row] in included:
        print(elements_data_df['hasName'].iloc[row], "->", elements_data_df['cluster'].iloc[row])

In [None]:
elements_dict = {}

for row in range(len(elements_data)):

    if elements_data_df['hasName'].iloc[row] in included:
        name = elements_data_df['hasName'].iloc[row][0]
        cluster = elements_data_df['cluster'].iloc[row]
        elements_dict[name] = cluster

elements_dict

similar_elements_dict = {}
for row in range(len(elements_data)):

    if elements_data_df['hasName'].iloc[row] in not_included:
        name = elements_data_df['hasName'].iloc[row][0]
        cluster = elements_data_df['cluster'].iloc[row]
        
        for key in elements_dict:

            if elements_dict[key] == cluster:
                similar_elements_dict[name] = key

In [None]:
similar_elements_dict