# Import libraries

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

## Read data

In [2]:
uns = pd.read_csv('uns_df.csv', index_col=0)
isri = pd.read_csv('isri_df.csv', index_col=0)
process = pd.read_csv('process_df.csv', index_col=0)
anamet = pd.read_csv('anamet_df.csv', index_col=0)
materials = pd.read_csv('materials_df.csv', index_col=0)

# New *NIKOS* data

In [8]:
xls = pd.ExcelFile('data/edited_nikos_data.xlsx')

In [10]:
uns_data = pd.read_excel(xls, 'UNS data')


In [11]:
uns_data.rename(columns = {'sc:isComposedOf':'elements id'}, inplace = True)

In [12]:
elements_data = pd.read_excel(xls, 'Elements data')

In [13]:
uns_data = pd.merge(uns_data, elements_data, how='inner', on = 'elements id')

In [14]:
isri_data = pd.read_excel(xls, 'Isri data')

In [15]:
anamet_data = pd.read_excel(xls, 'Anamet data')

# NLP

In [16]:
from gensim.utils import simple_preprocess

In [19]:
# Keep only anamets with english sc:descritpion
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

e_description_index = []
for row in range(len(anamet_data)):
    if only_roman_chars(str(anamet_data['sc:description'].iloc[row])):
        e_description_index.append(row)
        
anamet_data = anamet_data.iloc[e_description_index]
anamet_data

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous,40014,
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous,40015,
33,sc:ANAMET_467,MIXED COPPER CABLES SCRAP( RAW ),Non-Ferrous,40091,
34,sc:ANAMET_468,CABLES COPPER - LEAD(RAW),Non-Ferrous,40133,
37,sc:ANAMET_469,COPPER SCRAP BERRY/CANDY AS PER ISR(RAW),Non-Ferrous,40210,
...,...,...,...,...,...
317,sc:ANAMET_684,LEAD SCRAP (E),Non-Ferrous,50185,
318,sc:ANAMET_685,ELECTRIC MOTORS SHREDDED (WEEE),Non-Ferrous,50259,
319,sc:ANAMET_686,BLACK PLASTIC SCRAP (WEEE) ( E ),Non-Ferrous,50117,
320,sc:ANAMET_687,SCRAP PLEXIGLASS ( E ),Non-Ferrous,50118,


Preprocess text data using `gensim.utils.simple_preprocess()`

In [26]:
anamet_data['sc:description'] = anamet_data['sc:description'].apply(simple_preprocess)
anamet_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anamet_data['sc:description'] = anamet_data['sc:description'].apply(simple_preprocess)


Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,"[copper, scrap, millberry, cables, for, strip]",Non-Ferrous,40014,
30,sc:ANAMET_466,"[copper, scrap, birch, cables, for, stripping]",Non-Ferrous,40015,


In [29]:
# Delete word scrap
for i in range(len(anamet_data)):
    if 'scrap' in anamet_data['sc:description'].iloc[i]:
        anamet_data['sc:description'].iloc[i].remove('scrap')

In [30]:
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,"[copper, millberry, cables, for, strip]",Non-Ferrous,40014,
30,sc:ANAMET_466,"[copper, birch, cables, for, stripping]",Non-Ferrous,40015,


# Anamet

In [None]:
anamet.head()

In [None]:
anamet.shape

How many __unique__ ANAMET types exist?

In [None]:
anamet['name_src'].nunique()

In [None]:
anamet['full_community_id_src'].nunique()

ANAMET Relationships

In [None]:
anamet['Relationship'].value_counts()

In [None]:
anamet[anamet['Relationship'] == 'IS_PROCESSED_WITH']['name_trg'].value_counts()

Όλα περνάνε και από τα 3 processes.

# ISRI

In [None]:
isri.head()

# UNS

In [None]:
uns.head()

In [None]:
uns.shape

In [None]:
uns['Relationship'].value_counts()

In [None]:
print(uns['name_src'].nunique())

# Clustering

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
def label_encoding(df):
    for col in df.columns:
        if df[col].dtypes == 'object':
            labelencoder = LabelEncoder()
            df[col] = labelencoder.fit_transform(df[col])        
            del labelencoder
    return df

In [None]:
uns_copy = uns.copy()

In [None]:
uns = label_encoding(uns)

In [None]:
uns.isna().sum()

In [None]:
uns = uns.dropna()

In [None]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=5).fit(uns)
neigh_dist, neigh_ind = nbrs.kneighbors(uns)
sort_neigh_dist = np.sort(neigh_dist, axis=0)

In [None]:
import matplotlib.pyplot as plt
k_dist = sort_neigh_dist[:, 4]
plt.plot(k_dist)
plt.axhline(y=2.5, linewidth=1, linestyle='dashed', color='k')
plt.ylabel("k-NN distance")
plt.xlabel("Sorted observations (4th NN)")
plt.show()

In [None]:
from sklearn.cluster import DBSCAN

clusters = DBSCAN(eps=22, min_samples=100).fit(uns)
clusters.labels_
set(clusters.labels_)

In [None]:
uns_copy = uns_copy.dropna()
uns_copy['cluster'] = clusters.labels_

In [None]:
uns_copy[uns_copy['cluster'] == 6]

In [None]:
uns

In [None]:
!pip install --upgrade gensim

# New *NIKOS* data

In [None]:
xls = pd.ExcelFile('data/edited_nikos_data.xlsx')
xls.sheet_names

In [None]:
uns_data = pd.read_excel(xls, 'UNS data')
uns_data.head()

In [None]:
uns_data.rename(columns = {'sc:isComposedOf':'elements id'}, inplace = True)

In [None]:
elements_data = pd.read_excel(xls, 'Elements data')
elements_data.head()

In [None]:
uns_data = pd.merge(uns_data, elements_data, how='inner', on = 'elements id')
uns_data

In [None]:
isri_data = pd.read_excel(xls, 'Isri data')
isri_data.head()

In [None]:
anamet_data = pd.read_excel(xls, 'Anamet data')
anamet_data.head()

Keep only anamets with english description

In [None]:
english_anamet_data['anamet entity'].nunique()

In [None]:
english_anamet_data['sc:hasCategory'].value_counts()

In [None]:
english_anamet_data['sc:isProcessedBy'].value_counts()

In [None]:
anamet_data['sc:hasCategory'].value_countsnts()

In [None]:
# Func to count number of common words between two strings
def num_of_common_words(s1, s2):
    s1 = s1.lower()
    s2 = s2.lower()
    s1List = s1.split(" ")
    s2List = s2.split(" ")
    return len(list(set(s1List) & set(s2List)))

In [None]:
num_of_common_words(english_anamet_data['sc:description'].iloc[0], 
                    english_anamet_data['sc:description'].iloc[1])

In [None]:
isri_data

In [None]:
df1 = pd.DataFrame()
df1['anamet'] = english_anamet_data['sc:description']
df1['uns'] = uns_data['hasName']
df1.head(2)

In [None]:
df2 = df1.apply(lambda x: ','.join(x.astype(str)), axis=1)
df_clean = pd.DataFrame({'clean': df2})

In [None]:
# Create the list of list format of the custom corpus for gensim modeling 
sent = [row.split(',') for row in df_clean['clean']]

In [None]:
sent[:2]

In [None]:
from gensim.models import Word2Vec

model = Word2Vec(sent, min_count=1, vector_size=50, 
                 workers=3, window=3, sg=1)

In [None]:
model.train(sent, total_examples=100, epochs=10)

In [None]:
model.wv.similarity(sent[1][0], sent[1][1])

In [None]:
english_anamet_data['sc:description'].iloc[0]

### Word2Vec for similar scraps

In [None]:
nlp_anamet = english_anamet_data.copy()

# NLP preprocess
from gensim.utils import simple_preprocess
description = nlp_anamet['sc:description'].apply(simple_preprocess)
description

In [None]:
for i in range(len(description)):
    if 'scrap' in description.iloc[i]:
        description.iloc[i].remove('scrap')

In [None]:
description

In [None]:
model = Word2Vec(min_count=1, 
                 vector_size=50, 
                 workers=4, 
                 window=1, 
                 sg=1)

In [None]:
model.build_vocab(description, progress_per=10)

In [None]:
model.epochs

In [None]:
model.train(description, total_examples=model.corpus_count,
           epochs=100)

In [None]:
model.wv.most_similar("zinc")

In [None]:
nlp_anamet