In [1]:
import glob
import wandb
import fasttext
import numpy as np
import pandas as pd
from rich import print
from pathlib import Path
from tqdm.auto import tqdm
from cleanlab import Datalab
from dotenv import load_dotenv
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

In [2]:
load_dotenv()
wandb.login()
tqdm.pandas()

[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# wandb configs
PROJECT_NAME = 'ARABIC_DISAMBIGUATION_STUDY'
JOB_TYPE = 'CLEANSING'
TAGS = ['CLEANSING', 'CLEANLAB']
NOTES = 'Estimating label errors using Cleanlab'
RUN_NAME = 'Label_Error_Estimation'
config = defaultdict(dict)

In [4]:
seed = 42
fasttext_model_path = '../models/cc.ar.300.bin'
analysis_path = Path('../data/analysis.txt')
issue_summary_path = Path('../data/issue_summary.csv')
to_use_artifact_name = 'Sentiment-Classification-Dataset:latest'

In [5]:
run = wandb.init(project=PROJECT_NAME, job_type=JOB_TYPE,
                 name=RUN_NAME, notes=NOTES, tags=TAGS, config=config)

In [6]:
data_artifact = run.use_artifact(to_use_artifact_name)

In [7]:
data_path = data_artifact.download(
    root=Path(f'../data/{to_use_artifact_name}'))

[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [8]:
data_path = Path(data_path)

In [9]:
data = []
for f in glob.glob(f'{str(data_path)}/*'):
    data.append(pd.read_csv(f))

In [10]:
data = pd.concat(data, ignore_index=True)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet         20000 non-null  object
 1   label         20000 non-null  object
 2   text          20000 non-null  object
 3   text_arabert  20000 non-null  object
dtypes: object(4)
memory usage: 625.1+ KB


In [12]:
label_names = data.label.unique().tolist()
label_names

['neutral', 'negative', 'positive']

In [13]:
label2id = {l: i for i, l in enumerate(label_names)}
id2label = {i: l for i, l in enumerate(label_names)}

In [14]:
data['label_id'] = data.label.progress_apply(lambda x: label2id[x])

  0%|          | 0/20000 [00:00<?, ?it/s]

In [15]:
data.head(2)

Unnamed: 0,tweet,label,text,text_arabert,label_id
0,آخر 24 ساعة في فاشية Covid-19 أخبار وطنية http...,neutral,آخر 24 ساعة في فاشية Covid-19 أخبار وطنية,آخر 24 ساع +ة في فاشي +ة Covid - 19 أخبار وطني...,0
1,كييف/ #أوكرانيا_بالعربية/ أعلنت وزارة الصحة #ا...,neutral,كييف- أوكرانيا بالعربية-أعلنت وزارة الصحة الأو...,كييف - أوكرانيا ب+ ال+ عربي +ة - أعلن +ت وزار ...,0


In [16]:
class_weights = data.label_id.value_counts(normalize=True).to_dict()
class_weights

{0: 0.8631, 1: 0.093, 2: 0.0439}

In [17]:
encoder = fasttext.load_model("../models/cc.ar.300.bin")

In [18]:
texts = data.text.to_list()
labels = data.label_id.to_list()

In [19]:
features = []
for text in tqdm(texts, total=len(texts)):
    features.append(encoder.get_sentence_vector(text))

  0%|          | 0/20000 [00:00<?, ?it/s]

In [20]:
del encoder

In [21]:
encoded_data = {"X": np.array(features), "Y": labels}

In [22]:
model = LogisticRegression(
    class_weight=class_weights, random_state=seed, max_iter=int(1e3)
)
pred_probs = cross_val_predict(
    estimator=model,
    X=encoded_data['X'],
    y=encoded_data['Y'],
    cv=20,
    method='predict_proba',
)

In [23]:
lab = Datalab(encoded_data, label_name='Y')
lab.find_issues(pred_probs=pred_probs, features=encoded_data['X'])

Finding null issues ...
Finding label issues ...
Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 6817 issues found in the dataset.




In [24]:
issue_summary = lab.get_issue_summary()
issue_summary

Unnamed: 0,issue_type,score,num_issues
0,,1.0,0
1,label,0.8688,2670
2,outlier,0.43508,385
3,near_duplicate,0.528267,3762
4,non_iid,0.057596,0
5,class_imbalance,0.0439,0
6,underperforming_group,0.47089,0


In [25]:
issue_summary.to_csv(issue_summary_path, index=False)

In [26]:
issue_summary_artifact = wandb.Artifact(
    name='Sentiment-Classification-Dataset-Label-Issue-Summary', type='dataset')

In [27]:
_ = issue_summary_artifact.add_file(
    local_path=issue_summary_path, name='cleanlab')

In [28]:
_ = run.log_artifact(issue_summary_artifact)

In [29]:
run.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))