# Imports

In [4]:
# to prevent to restart kernel when any changes are made to any imported file
%reload_ext autoreload
%autoreload 2

# to import any file from some other directory
# sys.path.append("/tmp/fastai/old")

# to stop printing warnings
import warnings
warnings.filterwarnings('ignore')
def warn(*args, **kwargs):
    pass
warnings.warn = warn
    
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000000)
pd.set_option('display.max_colwidth', 10000)

# to increase cells width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))


# DATA

In [5]:
data=pd.read_csv('/Users/muhammadfaisal/Documents/jobs/visable/task/sample_data_for_task1.csv')
print(data.shape)

data.head()

(37295, 2)


Unnamed: 0,text,label
0,zucker fabrik,ft
1,Lebensmittel kommssionierung,ft
2,geländer biegen,mr
3,gebäudeausrüstung technische,ct
4,kürbiskernöl softgels,ft


### labels-distribution

In [6]:
data.label.value_counts()

label
ft     11226
pkg     9617
ct      5061
mr      5016
ch      3688
cnc     2587
Name: count, dtype: int64

In [7]:
data.label.value_counts(normalize=True)


label
ft     0.301815
pkg    0.258556
ct     0.136067
mr     0.134857
ch     0.099153
cnc    0.069552
Name: proportion, dtype: float64

### text-repeatition

In [12]:
print(f'{round(data.text.nunique()/data.shape[0],2)*100}% text is unique')

95.0% text is unique


### empty-labels

In [14]:
data.label.isnull().sum()


100

### max-text-length

In [15]:
data.text.str.len().max()


798

### text-labels overlap


In [59]:
data.text.value_counts().to_frame().sample(100)

Unnamed: 0_level_0,count
text,Unnamed: 1_level_1
cnc fertigung vulkaneifel,1
kosmetiche verpackungen,2
Schneidmaschinen für karton,1
Bio Rapsöl,1
büroartikel aus aluminium,1
tiefziehen Aluminium,1
Nuss paste,1
fett lebensmittel,1
basilikum gehackt,1
kanister pumpe,1


# CLEANING

In [18]:
df=data.copy()

In [60]:
# dropping the 'text' column with empty values i.e. " "
df=df[df.text!=' ']

# dropping the 'label' column with empty values
df=df[df.label.notnull()]

# clean text
df['text']=(df.text.str.replace('"','')
           .str.replace('-\t','')
           .str.replace('+',''))

# MODEL

In [94]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import fasttext


### data-prep

In [26]:

df['label_ft'] = '__label__'+df['label']

df['text_ft']=df.label_ft+ ' ' +df.text


In [27]:
df.label_ft.value_counts()

label_ft
__label__ft     11176
__label__pkg     9567
__label__ct      5011
__label__mr      4966
__label__ch      3638
__label__cnc     2537
Name: count, dtype: int64

In [29]:
df[['text_ft']].head()

Unnamed: 0,text_ft
0,__label__ft zucker fabrik
1,__label__ft Lebensmittel kommssionierung
2,__label__mr geländer biegen
3,__label__ct gebäudeausrüstung technische
4,__label__ft kürbiskernöl softgels


### training

In [61]:
train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df.label_ft)
train.shape, test.shape

((25826, 4), (11069, 4))

In [62]:
train.text_ft.to_csv('fasttext_train.train', index=False, header=False)
test.text_ft.to_csv('fasttext_test.test', index=False, header=False)


In [63]:
model = fasttext.train_supervised(input="fasttext_train.train")

model.save_model("model.bin")


Read 0M words
Number of words:  20175
Number of labels: 6
Progress: 100.0% words/sec/thread:  822409 lr:  0.000000 avg.loss:  0.424297 ETA:   0h 0m 0s


In [64]:
model.test("fasttext_test.test", k=1)

(10903, 0.8741630743831973, 0.8741630743831973)

### single-prediction

In [65]:
pred=model.predict("hello", k=1)
conf=pred[1][0]
label=pred[0][0].replace('__label__', '')

label, conf

('ft', 0.8579162359237671)

### multi-prediction

In [95]:
# Load your trained FastText model
model = fasttext.load_model('model.bin')

# Assuming test_df is your test DataFrame with 'text' and 'label' columns
test_texts = test['text'].tolist()
true_labels = test['label'].tolist()

# Make predictions using the FastText model
predicted_labels = [model.predict(text)[0][0].replace('__label__', '') for text in test_texts]

# Get unique labels
all_labels = sorted(set(true_labels + predicted_labels))

# Generate confusion matrix with labels
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=all_labels)

# Create a DataFrame with labeled rows and columns
conf_matrix = pd.DataFrame(conf_matrix, index=all_labels, columns=all_labels)

# Generate classification report
class_report = classification_report(true_labels, predicted_labels)






In [96]:
# Print or use the results as needed
print("\nClassification Report:\n")

print(class_report)




Classification Report:

              precision    recall  f1-score   support

          ch       0.95      0.82      0.88      1092
         cnc       0.75      0.75      0.75       761
          ct       0.93      0.87      0.90      1503
          ft       0.84      0.93      0.88      3353
          mr       0.89      0.80      0.84      1490
         pkg       0.88      0.89      0.89      2870

    accuracy                           0.87     11069
   macro avg       0.87      0.84      0.86     11069
weighted avg       0.87      0.87      0.87     11069



In [97]:
print("Confusion Matrix:\n")

conf_matrix


Confusion Matrix:



Unnamed: 0,ch,cnc,ct,ft,mr,pkg
ch,900,20,5,123,7,37
cnc,3,572,10,65,69,42
ct,4,29,1301,98,36,35
ft,18,34,16,3129,9,147
mr,1,86,46,88,1190,79
pkg,19,26,19,226,25,2555


# Testing

In [30]:
import requests

In [46]:

url = 'http://127.0.0.1:8000/classify/'
# url='http://0.0.0.0:80/classify'

In [47]:
def test(text, url):
    x = requests.post(url, json = {'text': text})
    return x.text

### correct-output

In [58]:
test('zucker fabrik', url)

'{"label":"ft","conf":1.0}'

### low-confidence

In [49]:
test('helloworld', url)

'{"label":"undefined","conf":0.82}'

### invalid-data-type

In [50]:
test(5, url)


'{"detail":[{"type":"string_type","loc":["body","text"],"msg":"Input should be a valid string","input":5,"url":"https://errors.pydantic.dev/2.6/v/string_type"}]}'

### input-too-long

In [56]:
test('z'*2048, url)


'{"detail":[{"type":"string_too_long","loc":["body","text"],"msg":"String should have at most 1024 characters","input":"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz

### input-too-short

In [57]:
test('z', url)


'{"detail":[{"type":"string_too_short","loc":["body","text"],"msg":"String should have at least 2 characters","input":"z","ctx":{"min_length":2},"url":"https://errors.pydantic.dev/2.6/v/string_too_short"}]}'