# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
import scipy

from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Data Loading

In [2]:
# Open the workbook once; the cells below parse individual sheets from this handle.
xls = pd.ExcelFile('data/edited_nikos_data.xlsx')
# Show the sheet names available in the workbook.
xls.sheet_names

['ANAMET',
 'Anamet data',
 'ISRI',
 'Isri data',
 'UNS data',
 'Elements data',
 'Material data',
 'Process']

In [3]:
# Parse each sheet used below from the already-open workbook handle.
# 'Elements data' is read once; the merge below does not modify it.
elements_data = pd.read_excel(xls, sheet_name='Elements data')

# Rename the composition column so it matches the join key used in the merge,
# then attach the element columns to every UNS row. The inner join keeps only
# rows whose 'elements id' is present in both sheets.
uns_data = (
    pd.read_excel(xls, sheet_name='UNS data')
    .rename(columns={'sc:isComposedOf': 'elements id'})
    .merge(elements_data, how='inner', on='elements id')
)

isri_data = pd.read_excel(xls, sheet_name='Isri data')
anamet_data = pd.read_excel(xls, sheet_name='Anamet data')
material_data = pd.read_excel(xls, sheet_name='Material data')

# Raw data preprocessing

In [4]:
isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257


In [5]:
isri_data.shape

(244, 10)

In [6]:
isri_data.isna().sum()

isri id                  0
sc:hasCategory           0
sc:hasForm              43
sc:isFreeFrom           87
sc:contains             68
sc:hasISRICode           0
sc:hasSpecification     12
pc:hasName               7
sc:isComposedOf        109
sc:isProcessedBy       194
dtype: int64

#### Drop columns that are not useful

In [7]:
# `drop` returns a new frame, so `isri_data` itself is left unchanged.
unused_columns = ['isri id', 'sc:hasISRICode', 'sc:isProcessedBy']
df = isri_data.drop(columns=unused_columns)
df.shape

(244, 7)

In [8]:
import texthero as hero
from texthero import preprocessing

# NOTE(review): this list is defined but never passed to `hero.clean`, so the
# calls below run with texthero's own default pipeline - confirm this is intended.
custom_pipeline = [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_whitespace,
    preprocessing.remove_stopwords,
]

# Clean every column except the target column 'sc:isComposedOf'.
text_columns = df.columns.drop('sc:isComposedOf')
for col in text_columns:
    df[col] = hero.clean(df[col])

df.head()


Unnamed: 0,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasSpecification,pc:hasName,sc:isComposedOf
0,nonferrous scrap red metals,wire,,copper,bare uncoated unalloyed,copper wire,sc:Material_249
1,nonferrous scrap red metals,wire cable,brittle burnt wire copper tubing,copper,clean untinned uncoated unalloyed,copper wire,sc:Material_249
2,nonferrous scrap red metals,miscellaneous wire,excessively leaded tinned soldered copper wire...,copper,unalloyed,copper wire,sc:Material_249
3,nonferrous scrap red metals,clippings punchings bus bars commutator segmen...,,copper,clean unalloyed uncoated,heavy copper solids tubing,sc:Material_249
4,nonferrous scrap red metals,miscellaneous scrap,excessively leaded tinned soldered copper scra...,copper,unalloyed,copper solids tubing,sc:Material_249


In [9]:
df.isna().sum()

sc:hasCategory           0
sc:hasForm               0
sc:isFreeFrom            0
sc:contains              0
sc:hasSpecification      0
pc:hasName               0
sc:isComposedOf        109
dtype: int64

## Target column: `sc:isComposedOf`

### TfIdfVectorizer

In [10]:
# Keep only the rows that have a target value. The explicit copy makes `df` an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
df = df[df['sc:isComposedOf'].notna()].copy()
df.shape

(135, 7)

In [11]:
df.head(1)

Unnamed: 0,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasSpecification,pc:hasName,sc:isComposedOf
0,nonferrous scrap red metals,wire,,copper,bare uncoated unalloyed,copper wire,sc:Material_249


In [14]:
# Join the four descriptive text columns into one space-separated field per row.
df['corpus'] = df['sc:hasSpecification'].str.cat(
    [df['sc:hasForm'], df['sc:contains'], df['pc:hasName']],
    sep=' ',
)

In [15]:
df.head(1)

Unnamed: 0,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasSpecification,pc:hasName,sc:isComposedOf,corpus
0,nonferrous scrap red metals,wire,,copper,bare uncoated unalloyed,copper wire,sc:Material_249,bare uncoated unalloyed wire copper copper wire


#### Split into train and test set

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the combined text field; the target is 'sc:isComposedOf'.
X, y = df['corpus'], df['sc:isComposedOf']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=4)
y_train, y_test = y_train.values, y_test.values

# The TF-IDF vocabulary is fitted on the training texts only (min_df=2), and the
# test texts are then transformed with that same vocabulary.
vectorizer = TfidfVectorizer(min_df=2)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

print(train_vectors.shape, test_vectors.shape)

(108, 240) (27, 240)


In [17]:
X.head()

0      bare uncoated unalloyed wire copper copper wire
1    clean untinned uncoated unalloyed wire cable c...
2      unalloyed miscellaneous wire copper copper wire
3    clean unalloyed uncoated clippings punchings b...
4    unalloyed miscellaneous scrap copper copper so...
Name: corpus, dtype: object

####  Logistic Regression

In [18]:
# Baseline: logistic regression with default settings on the TF-IDF features.
model = LogisticRegression()
model.fit(train_vectors, y_train)

predicted = model.predict(test_vectors)
print(accuracy_score(y_test, predicted))

# True and predicted labels side by side for inspection.
results = pd.DataFrame({'true': y_test, 'pred': predicted})

0.5925925925925926


#### SVC

Note: with `probability=True`, the class with the highest `predict_proba` score may differ from the label returned by `predict`.

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hyper-parameter candidates explored by the exhaustive search.
param_grid = dict(
    kernel=('linear', 'rbf'),
    C=[5, 10, 12, 15, 20, 25],
    gamma=[1, 0.1, 0.2, 0.3, 0.5, 0.01, 0.001],
)

# probability=True enables `predict_proba`, which is used later for top-3 ranking.
base_svc = SVC(probability=True, random_state=42)
grid = GridSearchCV(base_svc, param_grid, refit=True, verbose=0, cv=5)
grid.fit(train_vectors, y_train)
print(grid.best_estimator_)

# Score the refitted best estimator on the held-out split.
model_svc = grid.best_estimator_
predicted = model_svc.predict(test_vectors)
print(accuracy_score(y_test, predicted))



SVC(C=5, gamma=1, kernel='linear', probability=True, random_state=42)
0.8518518518518519


#### KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-NN with default hyper-parameters.
model = KNeighborsClassifier()
model.fit(train_vectors, y_train)
predicted = model.predict(test_vectors)
print(accuracy_score(y_test, predicted))

# Candidate neighbourhood sizes and Minkowski powers for the grid search.
n_neighbors = list(range(1, 20))
p = [1, 2]
param_grid = dict(n_neighbors=n_neighbors, p=p)

grid = GridSearchCV(model, param_grid, refit=True, verbose=0, cv=3)
grid.fit(train_vectors, y_train)
print(grid.best_estimator_)

# Score the tuned estimator. The previous version predicted with the untuned
# baseline `model` here, so the second accuracy merely repeated the first one.
model_knn = grid.best_estimator_
predicted = model_knn.predict(test_vectors)
print(accuracy_score(y_test, predicted))

0.8888888888888888




KNeighborsClassifier(n_neighbors=1)
0.8888888888888888


#### Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Baseline forest. A fixed random_state makes the forest (and the accuracies
# printed below) reproducible across kernel restarts, matching the seeded SVC.
model = RandomForestClassifier(random_state=42)

model.fit(train_vectors, y_train)
predicted = model.predict(test_vectors)
print(accuracy_score(y_test, predicted))

# Number of trees in the random forest.
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 60, num = 5)]
# NOTE(review): with num=1 this yields the single depth [2] (plus None below);
# raise `num` if a wider range of depths was intended.
max_depth = [int(x) for x in np.linspace(2, 10, num = 1)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid.
param_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seed the sampler as well, so the same 20 candidate settings are drawn on every run.
grid = RandomizedSearchCV(model , param_grid , refit=True , verbose=0, cv=3, n_iter=20, random_state=42)
grid.fit(train_vectors, y_train)
print(grid.best_estimator_)

model_rf = grid.best_estimator_
predicted = model_rf.predict(test_vectors)
print(accuracy_score(y_test,predicted))

0.8148148148148148




RandomForestClassifier(n_estimators=45)
0.8148148148148148


#### Gaussian Process Classifier

In [22]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

# Dense copies of the sparse TF-IDF matrices. `.toarray()` returns a plain
# ndarray; `.todense()` returns np.matrix, which recent scikit-learn releases
# reject (older ones emit a FutureWarning).
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

# Baseline: Gaussian process classifier with its default kernel.
model = GaussianProcessClassifier()
model.fit(train_dense, y_train)
predicted = model.predict(test_dense)
print(accuracy_score(y_test,predicted))

# Candidate kernels. The dict gets its own name so that `grid` only ever refers
# to the search object.
kernel_grid = {
    'kernel': [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()],
}
# define search
grid = GridSearchCV(model, kernel_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid.fit(train_dense, y_train)

model = grid.best_estimator_
predicted = model.predict(test_dense)
print(accuracy_score(y_test,predicted))



0.4444444444444444






0.9259259259259259




In [23]:
# Keep the best estimator of the most recent search under its own name; `grid`
# is reassigned by every search cell, so this must run right after the GP search.
model_gp=grid.best_estimator_
model_gp

In [24]:
# Hold-out accuracy of the selected GP model. `.toarray()` yields a plain ndarray;
# the np.matrix returned by `.todense()` is rejected by recent scikit-learn releases.
predicted = model_gp.predict(test_vectors.toarray())
print(accuracy_score(y_test,predicted))

0.9259259259259259




#### Train the best models on the whole `ISRI` dataset

In [25]:
# Transform the full ISRI corpus with the vectorizer that was fitted on the
# training split only, so the vocabulary is unchanged.
# NOTE(review): relies on `X` still being the ISRI corpus; `X` is reassigned in
# later cells, so this cell must be run in order.
vectors = vectorizer.transform(X)
print(vectors.shape)

(135, 240)


In [26]:
# Refit the GP model on the full ISRI set. `.toarray()` gives a plain ndarray;
# the np.matrix returned by `.todense()` is rejected by recent scikit-learn releases.
model_gp.fit(vectors.toarray(), y)



In [27]:
# Sanity check only: after refitting on all rows, the test rows are part of the
# training data. `.toarray()` replaces `.todense()` to avoid passing np.matrix.
predicted = model_gp.predict(test_vectors.toarray())
print(accuracy_score(y_test,predicted))



0.9629629629629629


In [28]:
# Refit the tuned SVC on the full ISRI set, then re-score on the former test rows
# (which are now part of the training data). `.toarray()` replaces `.todense()`
# so a plain ndarray, not np.matrix, is passed to scikit-learn.
model_svc.fit(vectors, y)
predicted = model_svc.predict(test_vectors.toarray())
print(accuracy_score(y_test,predicted))

1.0




The accuracy is this high because the models are now predicting rows that were part of their training set.

Next, we will try to predict the material that `ANAMET` scraps are composed of, using the models trained on the `ISRI` dataset.

In [29]:
# Refit the tuned k-NN on the full ISRI set, then re-score on the former test rows
# (which are now part of the training data). `.toarray()` replaces `.todense()`
# so a plain ndarray, not np.matrix, is passed to scikit-learn.
model_knn.fit(vectors, y)
predicted = model_knn.predict(test_vectors.toarray())
print(accuracy_score(y_test,predicted))

1.0




In [30]:
# Refit the tuned random forest on the full ISRI set, then re-score on the former
# test rows (which are now part of the training data). `.toarray()` replaces
# `.todense()` so a plain ndarray, not np.matrix, is passed to scikit-learn.
model_rf.fit(vectors, y)
predicted = model_rf.predict(test_vectors.toarray())
print(accuracy_score(y_test,predicted))

1.0




#### Keep only `ANAMET` rows with an English `sc:description`

In [31]:
# Re-read the 'Anamet data' sheet so the filtering below starts from the raw rows.
anamet_data = pd.read_excel(xls, 'Anamet data')

In [32]:
import unicodedata as ud

# Cache of per-character results so each distinct character is looked up only once.
latin_letters = {}

def is_latin(uchr):
    """Return True if the Unicode name of the character `uchr` contains 'LATIN'.

    Results are memoised in the module-level `latin_letters` dict. A character
    that has no Unicode name (for example a control character) is reported as
    non-Latin instead of raising ValueError.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # The '' default makes `ud.name` return an empty name rather than raise.
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))

def only_roman_chars(unistr):
    """Return True when every alphabetic character of `unistr` is Latin.

    Non-alphabetic characters (digits, punctuation, whitespace) are skipped, so a
    string that contains no letters at all also yields True.
    """
    for uchr in unistr:
        if uchr.isalpha() and not is_latin(uchr):
            return False
    return True

# Row positions whose description (converted to str) contains only Latin letters.
e_description_index = [
    position
    for position, description in enumerate(anamet_data['sc:description'])
    if only_roman_chars(str(description))
]

# Positional selection keeps the original row labels of the retained rows.
anamet_data = anamet_data.iloc[e_description_index]
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous,40014,
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous,40015,


In [33]:
anamet_data.shape

(214, 5)

In [34]:
# Keep the entity id and the two text columns. `.copy()` detaches the selection
# from the filtered frame, so the column assignments made in later cells do not
# trigger pandas' SettingWithCopyWarning.
anamet_data = anamet_data[['anamet entity', 'sc:description', 'sc:hasCategory']].copy()

anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous


In [35]:
anamet_data.isna().sum()

anamet entity     0
sc:description    0
sc:hasCategory    0
dtype: int64

In [36]:
# Clean every column except the 'anamet entity' id column.
anamet_text_columns = anamet_data.columns.drop('anamet entity')
for col in anamet_text_columns:
    anamet_data[col] = hero.clean(anamet_data[col])

anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory
28,sc:ANAMET_465,copper scrap millberry cables strip,non ferrous
30,sc:ANAMET_466,copper scrap birch cables stripping,non ferrous


In [37]:
# One-column frame holding the text to classify: description followed by category.
anamet_corpus = anamet_data['sc:description'] + ' ' + anamet_data['sc:hasCategory']
anamet_data_X = pd.DataFrame({'corpus': anamet_corpus})
# Alternative kept for reference (description only):
# anamet_data_X['corpus'] = anamet_data['sc:description']

In [38]:
# NOTE: `X` is reused here and no longer refers to the ISRI corpus.
X = anamet_data_X['corpus']

# Transform (not fit) with the existing vectorizer, so the feature columns are
# the same ones the models were trained on.
anamet_train_vectors = vectorizer.transform(X)

print(anamet_train_vectors.shape)

(214, 240)


In [39]:
# Predict a material id for every ANAMET row with the tuned SVC.
model = model_svc
y_hat = model.predict(anamet_train_vectors)

In [40]:
# Store the predicted labels next to the descriptions they were computed from.
anamet_data['predicted material'] = y_hat

# Allow up to 300 rows to be rendered when a frame is displayed.
pd.set_option('display.max_rows', 300)
anamet_data.head()

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,predicted material
28,sc:ANAMET_465,copper scrap millberry cables strip,non ferrous,sc:Material_249
30,sc:ANAMET_466,copper scrap birch cables stripping,non ferrous,sc:Material_249
33,sc:ANAMET_467,mixed copper cables scrap raw,non ferrous,sc:Material_249
34,sc:ANAMET_468,cables copper lead raw,non ferrous,sc:Material_249
37,sc:ANAMET_469,copper scrap berry candy per isr raw,non ferrous,sc:Material_249


In [41]:
# Class-probability table: one row per ANAMET item, one column per class label.
pred_proba = pd.DataFrame(
    model.predict_proba(anamet_train_vectors),
    columns=model_svc.classes_,
)

def _top_three(row):
    """Return the three most probable labels followed by their probabilities."""
    best = row.nlargest(3)
    return pd.Series(np.concatenate([best.index.values, best]))

# Columns 0-2 hold the labels, columns 3-5 the matching probabilities.
res = pred_proba.apply(_top_three, axis=1)
res.head()

Unnamed: 0,0,1,2,3,4,5
0,sc:Material_249,sc:Material_695,sc:Material_244,0.244001,0.130386,0.070438
1,sc:Material_249,sc:Material_695,sc:Material_244,0.244001,0.130386,0.070438
2,sc:Material_249,sc:Material_244,sc:Material_695,0.172134,0.127243,0.122255
3,sc:Material_249,sc:Material_244,sc:Material_695,0.183803,0.10158,0.077002
4,sc:Material_249,sc:Material_695,sc:Material_244,0.244001,0.130386,0.070438


In [42]:
# `res` carries a fresh 0..n-1 index while `anamet_data` keeps its original row
# labels, so an index-aligned concat would pair probabilities with the wrong
# descriptions and drop every label missing from either side. Both frames hold
# one row per ANAMET item in the same order, so relabel the probabilities with
# the ANAMET index before joining.
assert len(res) == len(anamet_data), "expected one probability row per ANAMET row"
res_aligned = res.copy()
res_aligned.index = anamet_data.index
result = pd.concat([anamet_data, res_aligned], axis=1, join='inner')
display(result.head(2))

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,predicted material,0,1,2,3,4,5
28,sc:ANAMET_465,copper scrap millberry cables strip,non ferrous,sc:Material_249,sc:Material_249,sc:Material_695,sc:Material_244,0.244001,0.130386,0.070438
30,sc:ANAMET_466,copper scrap birch cables stripping,non ferrous,sc:Material_249,sc:Material_249,sc:Material_695,sc:Material_244,0.244001,0.130386,0.070438


In [43]:
# Write the SVC predictions to a new workbook, in a sheet named 'SVC'.
result.to_excel('results/predict_material.xlsx', sheet_name='SVC')

#### Gaussian Process Model

In [44]:
# Predict a material id for every ANAMET row with the GP model. `.toarray()`
# gives a plain ndarray; the np.matrix returned by `.todense()` is rejected by
# recent scikit-learn releases.
model = model_gp
y_hat = model.predict(anamet_train_vectors.toarray())



In [45]:
# Overwrite the 'predicted material' column with the labels from the current `y_hat`.
anamet_data['predicted material'] = y_hat

# Allow up to 300 rows to be rendered when a frame is displayed.
pd.set_option('display.max_rows', 300)
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,predicted material
28,sc:ANAMET_465,copper scrap millberry cables strip,non ferrous,sc:Material_249
30,sc:ANAMET_466,copper scrap birch cables stripping,non ferrous,sc:Material_249


In [46]:
# Class-probability table: one row per ANAMET item, one column per class label.
# `.toarray()` gives a plain ndarray; the np.matrix returned by `.todense()` is
# rejected by recent scikit-learn releases.
pred_proba = pd.DataFrame(model.predict_proba(anamet_train_vectors.toarray()))
pred_proba.columns = model.classes_

# Per row: the three most probable labels (columns 0-2) followed by their
# probabilities (columns 3-5).
res = pred_proba.apply(lambda x: pd.Series(np.concatenate([x.nlargest(3).index.values, x.nlargest(3)])), axis=1)
res.head(2)



Unnamed: 0,0,1,2,3,4,5
0,sc:Material_249,sc:Material_695,sc:Material_250,0.137387,0.068485,0.04845
1,sc:Material_249,sc:Material_695,sc:Material_250,0.137387,0.068485,0.04845


In [47]:
# `res` carries a fresh 0..n-1 index while `anamet_data` keeps its original row
# labels, so an index-aligned concat would pair probabilities with the wrong
# descriptions and drop every label missing from either side. Both frames hold
# one row per ANAMET item in the same order, so relabel the probabilities with
# the ANAMET index before joining.
assert len(res) == len(anamet_data), "expected one probability row per ANAMET row"
res_aligned = res.copy()
res_aligned.index = anamet_data.index
result = pd.concat([anamet_data, res_aligned], axis=1, join='inner')
display(result.head(2))

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,predicted material,0,1,2,3,4,5
28,sc:ANAMET_465,copper scrap millberry cables strip,non ferrous,sc:Material_249,sc:Material_249,sc:Material_695,sc:Material_250,0.137387,0.068485,0.04845
30,sc:ANAMET_466,copper scrap birch cables stripping,non ferrous,sc:Material_249,sc:Material_249,sc:Material_695,sc:Material_250,0.137387,0.068485,0.04845


In [48]:
# Append the GP predictions as a second sheet of the existing workbook.
# NOTE(review): with mode='a' the workbook must already exist, and re-running this
# cell on its own may fail or add a duplicate sheet depending on the pandas
# version - confirm, or re-run the SVC export cell first.
with pd.ExcelWriter('results/predict_material.xlsx', mode='a') as writer:  
    result.to_excel(writer, sheet_name='GP')

# Test models using scrap descriptions from the Internet

In [49]:
# Free-text scrap descriptions used as an informal check of the trained models.
s = [
    'Shredded demolition windows profiles | 5-20 cm, around 1% attachment of thermal break (plastic). Are not passed by X-ray treatment.',
    'Open Extinguisher with no heads | 6061 grade',
    'I sell structural steel oxycut both in skeleton (6 x 2.5 m. plates) and some cut parts. Hot-rolled non-alloy carbon steel cleaned of paints and greases. Dry product.',
    'Regulary purchase of galvalume dross, capacity 300-500mt/month',
    'With no cable, plate or grit. CuSN in cable, plate or grit, separately. Not mixed. In the case of small packaged (30x30 cm) and palletized cable, ball and shot in BIG BAGS',
    'Clean copper aluminum radiator without iron, the price may vary depending on the LME of the copper Lowest 3 months 7656€.',
]

# NOTE: `df` and `X` are reused here and no longer refer to the ISRI data.
df = pd.DataFrame({'corpus': s})
df['corpus'] = hero.clean(df['corpus'])
X = df['corpus']

# Transform with the existing vectorizer so the features match the trained models.
validation_vectors = vectorizer.transform(X)
print(validation_vectors.shape)

model_svc.predict(validation_vectors)

(6, 240)


array(['sc:Material_700', 'sc:Material_244', 'sc:Material_700',
       'sc:Material_695', 'sc:Material_244', 'sc:Material_249'],
      dtype=object)

In [50]:
# `.toarray()` gives a plain ndarray; the np.matrix returned by `.todense()` is
# rejected by recent scikit-learn releases.
model_gp.predict(validation_vectors.toarray())



array(['sc:Material_700', 'sc:Material_244', 'sc:Material_700',
       'sc:Material_695', 'sc:Material_244', 'sc:Material_244'],
      dtype='<U15')

In [51]:
# `.toarray()` gives a plain ndarray; the np.matrix returned by `.todense()` is
# rejected by recent scikit-learn releases.
model_knn.predict(validation_vectors.toarray())



array(['sc:Material_700', 'sc:Material_695', 'sc:Material_703',
       'sc:Material_695', 'sc:Material_244', 'sc:Material_249'],
      dtype=object)

In [52]:
# `.toarray()` gives a plain ndarray; the np.matrix returned by `.todense()` is
# rejected by recent scikit-learn releases.
model_rf.predict(validation_vectors.toarray())



array(['sc:Material_700', 'sc:Material_700', 'sc:Material_700',
       'sc:Material_700', 'sc:Material_700', 'sc:Material_244'],
      dtype=object)

In [53]:
material_data

Unnamed: 0,Material ID,sc:hasName,sc:contains
0,sc:Material_244,ALUMINUM,
1,sc:Material_245,MALLEABLE IRON,
2,sc:Material_246,ALUMINUM BRASS,
3,sc:Material_247,BRASS,
4,sc:Material_248,BRONZE,
5,sc:Material_249,COPPER,
6,sc:Material_250,LEADED BRASS,
7,sc:Material_251,LEAD-FREE BISMUTH BRASS,
8,sc:Material_252,MANGANESE BRONZE,
9,sc:Material_253,MUNTZ METAL,
