In [27]:
import pandas as pd
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

In [7]:
data = pd.read_csv('annotated_data.csv', encoding='utf-8')
data.head()

Unnamed: 0,BrandName,Description,Annotations
0,roadster,mid clean fit navy look men slim jeans roadste...,"[(39, 47, 'BRAND')]"
1,locomotive,black track fit white solid pants locomotive m...,"[(34, 44, 'BRAND')]"
2,roadster,roadster men sustainable casual navy white geo...,"[(0, 8, 'BRAND')]"
3,zivame,shapewear black zivame saree women zi3023core0...,"[(16, 22, 'BRAND')]"
4,roadster,neck women white t pure cotton roadster v shir...,"[(31, 39, 'BRAND')]"


In [8]:
data['Annotations'] = data['Annotations'].apply(eval)

train_data,val_data = train_test_split(data, test_size=0.3, random_state=42) 
TRAIN_DATA = [(row['Description'], {'entities': row['Annotations']}) for index, row in train_data.iterrows()]
VALIDATION_DATA = [(row['Description'], {'entities': row['Annotations']}) for index, row in val_data.iterrows()]

In [10]:
nlp = spacy.blank('en')

ner = nlp.add_pipe('ner', last=True)

for _, annotations in TRAIN_DATA:
    for ent in annotations['entities']:
        ner.add_label(ent[2])

misaligned_data = []

def check_and_store_misalignment(text, annotations):
    try:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        return example
    except Exception as e:
        misaligned_data.append((text, annotations))
        return None

train_examples = []
for text, annotations in TRAIN_DATA:
    example = check_and_store_misalignment(text, annotations)
    if example is not None:
        train_examples.append(example)
val_examples = []
for text, annotations in VALIDATION_DATA:
    example = check_and_store_misalignment(text, annotations)
    if example is not None:
        val_examples.append(example)

pipe_exceptions = ["ner"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for iteration in range(10):
        losses = {}
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, drop=0.5, losses=losses)
        print(f"Iteration {iteration}, Losses: {losses}")

nlp.to_disk("ner_fashion_model")

import json

with open("misaligned_data.json", "w") as f:
    json.dump(misaligned_data, f)




Iteration 0, Losses: {'ner': 32999.286198553295}
Iteration 1, Losses: {'ner': 12191.21977536708}
Iteration 2, Losses: {'ner': 8998.343407670412}
Iteration 3, Losses: {'ner': 7603.965779417678}
Iteration 4, Losses: {'ner': 6949.384328991829}
Iteration 5, Losses: {'ner': 6629.0588964311}
Iteration 6, Losses: {'ner': 6184.089366248562}
Iteration 7, Losses: {'ner': 5955.586147629567}
Iteration 8, Losses: {'ner': 5791.551015521962}
Iteration 9, Losses: {'ner': 5707.714286759366}


In [14]:
nlp = spacy.load("ner_fashion_model")

In [21]:
val_data

Unnamed: 0,BrandName,Description,Annotations
306203,richlook,fit white checked shirt men richlook slim casual,"[(28, 36, BRAND)]"
147093,allen solly,allen solly men maroon white brand logo printe...,"[(0, 11, BRAND)]"
274303,van heusen sport,van heusen sport men maroon grey slim fit tart...,"[(0, 16, BRAND)]"
329193,gracit,floral gracit pack palazzos of 2 black women p...,"[(7, 13, BRAND)]"
340369,kalini,fuchsia pack of sarees floral 2 kalini black,"[(32, 38, BRAND)]"
...,...,...,...
173850,uf,mustard tiered solid women stylish uf dress,"[(35, 37, BRAND)]"
37914,hrx by hrithik roshan,hrx by hrithik roshan women deep wisteria soli...,"[(0, 21, BRAND)]"
349951,sanganeri kurti,sanganeri kurti women yellow pink printed flar...,"[(0, 15, BRAND)]"
202263,united colors of benetton,united colors of benetton men blue light fade ...,"[(0, 25, BRAND)]"


In [24]:
pred = []
for description in val_data['Description']:
    doc = nlp(description)
    flag = 0
    for ent in doc.ents:
        if ent.label_ == "BRAND": 
            flag = 1
    if flag == 1:
        pred.append(ent.text)
    else:
        pred.append('-')
    
val_data['Predictions'] = pred
val_data

Unnamed: 0,BrandName,Description,Annotations,Predictions
306203,richlook,fit white checked shirt men richlook slim casual,"[(28, 36, BRAND)]",richlook
147093,allen solly,allen solly men maroon white brand logo printe...,"[(0, 11, BRAND)]",allen solly
274303,van heusen sport,van heusen sport men maroon grey slim fit tart...,"[(0, 16, BRAND)]",van heusen sport
329193,gracit,floral gracit pack palazzos of 2 black women p...,"[(7, 13, BRAND)]",gracit
340369,kalini,fuchsia pack of sarees floral 2 kalini black,"[(32, 38, BRAND)]",kalini
...,...,...,...,...
173850,uf,mustard tiered solid women stylish uf dress,"[(35, 37, BRAND)]",uf
37914,hrx by hrithik roshan,hrx by hrithik roshan women deep wisteria soli...,"[(0, 21, BRAND)]",hrx by hrithik roshan
349951,sanganeri kurti,sanganeri kurti women yellow pink printed flar...,"[(0, 15, BRAND)]",sanganeri kurti
202263,united colors of benetton,united colors of benetton men blue light fade ...,"[(0, 25, BRAND)]",united colors of benetton


In [25]:
val_data['Success'] = val_data['BrandName'] == val_data['Predictions']

num_successes = val_data['Success'].sum()
total_rows = len(val_data)

success_percentage = (num_successes / total_rows) * 100

print(f"Success percentage: {success_percentage}%")

Success percentage: 98.62885357979363%


In [28]:
true_values = val_data['BrandName']
pred_value = val_data['Predictions']

precision = precision_score(true_values, pred_value, average='macro', zero_division=0)
recall = recall_score(true_values, pred_value, average='macro', zero_division=0)
f1 = f1_score(true_values, pred_value, average='macro', zero_division=0)
accuracy = accuracy_score(true_values, pred_value)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

Precision: 0.8609548422057155
Recall: 0.8539045941024858
F1 Score: 0.8564660653356275
Accuracy: 0.9862885357979363
