# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
import scipy

from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

## Read data

In [2]:
uns = pd.read_csv('uns_df.csv', index_col=0)
isri = pd.read_csv('isri_df.csv', index_col=0)
process = pd.read_csv('process_df.csv', index_col=0)
anamet = pd.read_csv('anamet_df.csv', index_col=0)
materials = pd.read_csv('materials_df.csv', index_col=0)

# Read updated *Nikos* data

In [3]:
xls = pd.ExcelFile('data/edited_nikos_data.xlsx')
xls.sheet_names

['ANAMET',
 'Anamet data',
 'ISRI',
 'Isri data',
 'UNS data',
 'Elements data',
 'Material data',
 'Process']

In [34]:
uns_data = pd.read_excel(xls, 'UNS data')

uns_data.rename(columns = {'sc:isComposedOf':'elements id'}, inplace = True)

elements_data = pd.read_excel(xls, 'Elements data')

uns_data = pd.merge(uns_data, elements_data, how='inner', on = 'elements id')
isri_data = pd.read_excel(xls, 'Isri data')
anamet_data = pd.read_excel(xls, 'Anamet data')
material_data = pd.read_excel(xls, 'Material data')
elements_data = pd.read_excel(xls, 'Elements data')

In [35]:
isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257


In [36]:
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
0,sc:ANAMET_437,ΣΚΡΑΠ ΨΑΛΙΔΙΣΜΕΝΟ ΠΥΚΝ >450 ΚG/M3 (ΥΛΗ),Ferrous,40000,sc:Process_690
1,sc:ANAMET_438,άγνωστο,Ferrous,9000,


In [37]:
uns_data.head(2)

Unnamed: 0,uns id,sc:hasName,elements id,ar,atomicNumber,density,meltingPoint,hasName,symbol
0,sc:UNS_712,C83300,sc:Elements_329,207.2,82,11.29,327.5,LEAD,Pb
1,sc:UNS_733,C84500,sc:Elements_329,207.2,82,11.29,327.5,LEAD,Pb


# Create `df` using `anamet` and `isri`

Προσπάθεια classifiaction ενός δοθέντος scap σε anamet / isri δεδομένου ενός description

### Raw data preprocessing

#### `ISRI`

In [38]:
isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasForm,sc:isFreeFrom,sc:contains,sc:hasISRICode,sc:hasSpecification,pc:hasName,sc:isComposedOf,sc:isProcessedBy
0,sc:ISRI_0,Nonferrous Scrap - Red Metals,Wire,,Copper,Berry,"Bare, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257
1,sc:ISRI_1,Nonferrous Scrap - Red Metals,Wire and Cable,"Brittle Burnt Wire, Copper Tubing",Copper,Barley,"Clean, Untinned, Uncoated, Unalloyed",No. 1 Copper Wire,sc:Material_249,sc:Process_257


In [39]:
isri_data.shape

(244, 10)

In [40]:
isri_data = isri_data[['isri id', 'sc:hasCategory', 'sc:hasSpecification']]

In [41]:
isri_data.isna().sum()

isri id                 0
sc:hasCategory          0
sc:hasSpecification    12
dtype: int64

In [42]:
import texthero as hero
from texthero import preprocessing

custom_pipeline = [preprocessing.fillna,
                   preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_stopwords]

for col in isri_data.columns:
    isri_data[col] = hero.clean(isri_data[col])

isri_data.head(2)

Unnamed: 0,isri id,sc:hasCategory,sc:hasSpecification
0,sc isri 0,nonferrous scrap red metals,bare uncoated unalloyed
1,sc isri 1,nonferrous scrap red metals,clean untinned uncoated unalloyed


In [43]:
isri_data.isna().sum()

isri id                0
sc:hasCategory         0
sc:hasSpecification    0
dtype: int64

#### `ANAMET`

In [44]:
anamet_data.head()

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
0,sc:ANAMET_437,ΣΚΡΑΠ ΨΑΛΙΔΙΣΜΕΝΟ ΠΥΚΝ >450 ΚG/M3 (ΥΛΗ),Ferrous,40000,sc:Process_690
1,sc:ANAMET_438,άγνωστο,Ferrous,9000,
2,sc:ANAMET_439,κουτάκι,Ferrous,9101,
3,sc:ANAMET_440,λαμαρίνες,Ferrous,9102,
4,sc:ANAMET_441,ζάντες + βαρύ προς shredder κλπ από ΟΤΚΖ,Ferrous,9103,


In [45]:
anamet_data.shape

(322, 5)

Keep only ANAMET with english `sc:description`

In [46]:
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

e_description_index = []
for row in range(len(anamet_data)):
    if only_roman_chars(str(anamet_data['sc:description'].iloc[row])):
        e_description_index.append(row)
        
anamet_data = anamet_data.iloc[e_description_index]
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory,sc:internal_code,sc:isProcessedBy
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous,40014,
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous,40015,


In [47]:
anamet_data.shape

(214, 5)

In [48]:
anamet_data = anamet_data[['anamet entity', 'sc:description', 'sc:hasCategory']]
anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory
28,sc:ANAMET_465,COPPER SCRAP MILLBERRY CABLES FOR STRIP.,Non-Ferrous
30,sc:ANAMET_466,COPPER SCRAP BIRCH CABLES FOR STRIPPING,Non-Ferrous


In [49]:
anamet_data.isna().sum()

anamet entity     0
sc:description    0
sc:hasCategory    0
dtype: int64

In [50]:
for col in anamet_data.columns:
    anamet_data[col] = hero.clean(anamet_data[col])

anamet_data.head(2)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory
28,sc anamet 465,copper scrap millberry cables strip,non ferrous
30,sc anamet 466,copper scrap birch cables stripping,non ferrous


# Combine both datasets

Will combine `ISRI` and `ANAMET` dataframes into one.

In [51]:
df = pd.DataFrame()

In [52]:
isri_data.head(1)

Unnamed: 0,isri id,sc:hasCategory,sc:hasSpecification
0,sc isri 0,nonferrous scrap red metals,bare uncoated unalloyed


In [53]:
anamet_data.head(1)

Unnamed: 0,anamet entity,sc:description,sc:hasCategory
28,sc anamet 465,copper scrap millberry cables strip,non ferrous


In [54]:
isri_data.rename(columns = {'isri id':'scrap_type', 'sc:hasCategory':'f1', 'sc:hasSpecification':'f2'}, inplace = True)
anamet_data.rename(columns= {'anamet entity':'scrap_type', 'sc:description':'f2', 'sc:hasCategory':'f1'}, inplace = True)

In [55]:
isri_data['scrap_type'] = 'isri'
anamet_data['scrap_type'] = 'anamet'

Concat the dfs

In [56]:
df = pd.concat([isri_data, anamet_data], axis=0)
df

Unnamed: 0,scrap_type,f1,f2
0,isri,nonferrous scrap red metals,bare uncoated unalloyed
1,isri,nonferrous scrap red metals,clean untinned uncoated unalloyed
2,isri,nonferrous scrap red metals,unalloyed
3,isri,nonferrous scrap red metals,clean unalloyed uncoated
4,isri,nonferrous scrap red metals,unalloyed
...,...,...,...
317,anamet,non ferrous,lead scrap e
318,anamet,non ferrous,electric motors shredded weee
319,anamet,non ferrous,black plastic scrap weee e
320,anamet,non ferrous,scrap plexiglass e


In [57]:
# df['description'] = df['f1'] + df['f2']

# df.drop(columns=['f1','f2'], inplace=True)
# df.head(2)

df['description'] = df['f2'] 

df.drop(columns=['f1','f2'], inplace=True)
df

Unnamed: 0,scrap_type,description
0,isri,bare uncoated unalloyed
1,isri,clean untinned uncoated unalloyed
2,isri,unalloyed
3,isri,clean unalloyed uncoated
4,isri,unalloyed
...,...,...
317,anamet,lead scrap e
318,anamet,electric motors shredded weee
319,anamet,black plastic scrap weee e
320,anamet,scrap plexiglass e


In [58]:
from sklearn.model_selection import train_test_split

X = df['description']
y = df['scrap_type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=4, stratify=y)

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

print(train_vectors.shape, test_vectors.shape)

(274, 604) (184, 604)


In [59]:
print(y_train.value_counts(normalize=True))
print(" ")
print(y_test.value_counts(normalize=True))

isri      0.532847
anamet    0.467153
Name: scrap_type, dtype: float64
 
isri      0.532609
anamet    0.467391
Name: scrap_type, dtype: float64


In [60]:
model = LogisticRegression()

model.fit(train_vectors, y_train)
predicted = model.predict(test_vectors)
print(accuracy_score(y_test,predicted))

results = pd.DataFrame(columns=['true', 'pred'])
results['true'] = y_test
results['pred'] = predicted

0.9565217391304348


In [61]:
from sklearn.svm import SVC

model = SVC()

model.fit(train_vectors, y_train)
predicted = model.predict(test_vectors)
print(accuracy_score(y_test,predicted))

results = pd.DataFrame(columns=['true', 'pred'])
results['true'] = y_test
results['pred'] = predicted

0.9619565217391305


In [62]:
results

Unnamed: 0,true,pred
178,anamet,anamet
90,isri,isri
177,anamet,anamet
234,isri,isri
48,isri,isri
...,...,...
23,isri,anamet
207,anamet,isri
118,isri,isri
75,anamet,anamet
