In [3]:
from corrections import *
from corruptions import *
from utils import *
import pandas as pd

In [3]:
# Test CSV that serves as input data for the pipelines in this notebook.
test_csv = "./data/final_test.csv"
test_df = pd.read_csv(test_csv)

In [4]:
# Input columns (review text and language tag) and the "stars" column as a float array.
feature_columns = ["review_body", "language"]
x_test = test_df.loc[:, feature_columns]
y_test = test_df["stars"].astype(float).to_numpy()

## Typo errors


We first simulate typo errors. In this case, we remove letters from words and test different fractions to see the impact on the inference phase.


In [None]:
# Base data directory; model_path points at its "model_weights" sub-folder.
path = "./data"
model_weights_dir = f"{path}/model_weights"
base_model = BaseModelInference(model_path=model_weights_dir)

### Evaluation cases

- Typos with different fractions
  - Random
  - Missing (if there is time, very unlikely)
- Mislabel with different fractions (not random; use a map to the full language name)
- Sentence structure
  - Just error simulation but not correction
- Swap columns
- Missing values in reviews
- Typos and mislabel together
  - Correction should be


In [89]:
# Small 20-row subset of the test set. random_state pins which rows are drawn,
# so downstream results are reproducible after a kernel restart.
sample = test_df.sample(20, random_state=50)

In [41]:
# Baseline: the pipeline contains only the model step, with no error-injection
# or correction steps, and runs on the full test frame.
baseline_steps = [("base_model", base_model)]
base_model_simulation = ErrorSimulationPipeline(steps=baseline_steps, data=test_df)

In [42]:
# Run the baseline pipeline (the log shows the BaseModelInference fit and transform stages).
base_model_simulation.exec()

BaseModelInference fit started
BaseModelInference transform started


24/04/04 19:05:03 WARN TaskSetManager: Stage 13 contains a task of very large size (2736 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [43]:
# Print the per-class precision/recall/F1 report for the baseline run.
base_model_simulation.report()

              precision    recall  f1-score   support

           1       0.48      0.57      0.52     17699
           2       0.32      0.30      0.31     17690
           3       0.34      0.22      0.27     21240
           4       0.37      0.18      0.24     26434
           5       0.58      0.83      0.68     42937

    accuracy                           0.48    126000
   macro avg       0.42      0.42      0.40    126000
weighted avg       0.44      0.48      0.44    126000



In [8]:
# 100-row subset shared by both pipelines below. random_state pins which rows
# are drawn so the comparison can be reproduced.
sample_typo_mis = test_df.sample(100, random_state=50)


def _typo_and_mislabel_steps():
    """Return fresh (name, transformer) steps for the typo and mislabel errors.

    New transformer instances are created on every call, so the simulation and
    correction pipelines never share a transformer object.
    """
    return [
        (
            "typo_error",
            TypoTransformer(
                fraction=0.5,
                column="review_body",
                fraction_typos=0.5,
                typo_mode="missing",
            ),
        ),
        (
            "mislabeled_error",
            CategoricalLabelTransformer(
                0.5,
                "language",
                {"en": "english", "es": "spanish", "tr": "turkish"},
            ),
        ),
    ]


# Errors only: typo and mislabel steps feed the base model directly.
typos_and_mislabeled_simulation = ErrorSimulationPipeline(
    steps=[*_typo_and_mislabel_steps(), ("base_model", base_model)],
    data=sample_typo_mis,
)

# Errors plus correction: the same error steps, then the language and typo
# handlers, then the base model.
typos_and_mislabeled_correction = ErrorSimulationPipeline(
    steps=[
        *_typo_and_mislabel_steps(),
        ("handle_language_errors", HandleLanguageErrors()),
        ("correct_typo", HandleTypoTransformer(column="review_body")),
        ("base_model", base_model),
    ],
    data=sample_typo_mis,
)

HandleTypoTransformer started


In [None]:
# Run the error-only pipeline, then the error-plus-correction pipeline.
typos_and_mislabeled_simulation.exec()
typos_and_mislabeled_correction.exec()

In [10]:
# Print the classification report for the error-only run, then for the run
# that also applies the correction steps.
typos_and_mislabeled_simulation.report()
typos_and_mislabeled_correction.report()

              precision    recall  f1-score   support

           1       0.36      0.57      0.44        14
           2       0.30      0.23      0.26        13
           3       0.60      0.18      0.27        17
           4       0.20      0.04      0.06        27
           5       0.38      0.76      0.51        29

    accuracy                           0.37       100
   macro avg       0.37      0.35      0.31       100
weighted avg       0.36      0.37      0.31       100

              precision    recall  f1-score   support

           1       0.44      0.57      0.50        14
           2       0.20      0.15      0.17        13
           3       0.31      0.24      0.27        17
           4       0.33      0.07      0.12        27
           5       0.43      0.79      0.56        29

    accuracy                           0.39       100
   macro avg       0.34      0.37      0.32       100
weighted avg       0.36      0.39      0.33       100



In [None]:
# NOTE(review): this cell repeats the two exec() calls of an earlier cell, after
# the reports have already been printed — confirm whether the re-run is still needed.
typos_and_mislabeled_simulation.exec()
typos_and_mislabeled_correction.exec()

In [124]:
# Error-only run: a SwappedValuesTransformer over "review_body"/"language"
# (fraction=0.5) feeds the base model, with no correction step.
swap_error_step = (
    "error",
    SwappedValuesTransformer(
        fraction=0.5, column="review_body", second_column="language"
    ),
)
swap_columns_simulation = ErrorSimulationPipeline(
    steps=[swap_error_step, ("base_model", base_model)],
    data=test_df,
)

In [125]:
# Run the swapped-columns error pipeline on the full test set.
swap_columns_simulation.exec()

BaseModelInference fit started
BaseModelInference transform started


24/04/05 00:36:27 WARN TaskSetManager: Stage 27 contains a task of very large size (2736 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [127]:
# Print the classification report for the swapped-columns error run.
swap_columns_simulation.report()

              precision    recall  f1-score   support

           1       0.49      0.29      0.36     17699
           2       0.23      0.39      0.29     17690
           3       0.34      0.11      0.17     21240
           4       0.36      0.09      0.14     26434
           5       0.48      0.82      0.61     42937

    accuracy                           0.41    126000
   macro avg       0.38      0.34      0.31    126000
weighted avg       0.40      0.41      0.36    126000



In [143]:
# Error plus correction: the same swap configuration as the error-only run,
# followed by HandleSwappedColumns before the base model.
swap_error_for_correction = (
    "error",
    SwappedValuesTransformer(
        fraction=0.5, column="review_body", second_column="language"
    ),
)
swap_corrector_step = ("corrector", HandleSwappedColumns())
swap_columns_correction = ErrorSimulationPipeline(
    steps=[swap_error_for_correction, swap_corrector_step, ("base_model", base_model)],
    data=test_df,
)

In [144]:
# Run the swapped-columns pipeline that includes the corrector step.
swap_columns_correction.exec()

BaseModelInference fit started
BaseModelInference transform started


24/04/05 01:18:48 WARN TaskSetManager: Stage 39 contains a task of very large size (2736 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [145]:
# Print the classification report for the swapped-columns run with correction.
swap_columns_correction.report()

              precision    recall  f1-score   support

           1       0.48      0.57      0.52     17699
           2       0.32      0.30      0.31     17690
           3       0.34      0.22      0.27     21240
           4       0.37      0.18      0.24     26434
           5       0.58      0.83      0.68     42937

    accuracy                           0.48    126000
   macro avg       0.42      0.42      0.40    126000
weighted avg       0.44      0.48      0.44    126000



In [123]:
# Error-only run: a SentenceStructureTransformer (type="rand", fraction=0.5) on
# "review_body" feeds the base model; there is no correction step for this error.
sentence_error_step = (
    "error",
    SentenceStructureTransformer(fraction=0.5, column="review_body", type="rand"),
)
sentence_structure_simulation = ErrorSimulationPipeline(
    steps=[sentence_error_step, ("base_model", base_model)],
    data=test_df,
)

In [39]:
# Run the sentence-structure error pipeline on the full test set.
sentence_structure_simulation.exec()

BaseModelInference fit started
BaseModelInference transform started


24/04/04 19:01:00 WARN TaskSetManager: Stage 12 contains a task of very large size (2763 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Sentence-structure errors don't affect accuracy: it stays at 0.48, the same as the unmodified baseline.


In [40]:
# Print the classification report for the sentence-structure error run.
sentence_structure_simulation.report()

              precision    recall  f1-score   support

           1       0.48      0.57      0.53     17699
           2       0.32      0.30      0.31     17690
           3       0.34      0.22      0.27     21240
           4       0.36      0.18      0.24     26434
           5       0.58      0.83      0.68     42937

    accuracy                           0.48    126000
   macro avg       0.42      0.42      0.40    126000
weighted avg       0.44      0.48      0.44    126000



In [44]:
# Error-only run: a CategoricalLabelTransformer on "language" (0.5, with a map
# from language codes to full names) feeds the base model.
language_full_names = {"en": "english", "es": "spanish", "tr": "turkish"}
mislabel_error_step = (
    "error",
    CategoricalLabelTransformer(0.5, "language", language_full_names),
)
mislabel_error_simulation = ErrorSimulationPipeline(
    steps=[mislabel_error_step, ("base_model", base_model)],
    data=test_df,
)

In [45]:
# Run the language-mislabel error pipeline on the full test set.
mislabel_error_simulation.exec()

BaseModelInference fit started
BaseModelInference transform started


24/04/04 19:13:26 WARN TaskSetManager: Stage 14 contains a task of very large size (2736 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [46]:
# Print the classification report for the language-mislabel error run.
mislabel_error_simulation.report()

              precision    recall  f1-score   support

           1       0.48      0.57      0.52     17699
           2       0.32      0.30      0.31     17690
           3       0.34      0.22      0.27     21240
           4       0.37      0.18      0.24     26434
           5       0.58      0.83      0.68     42937

    accuracy                           0.48    126000
   macro avg       0.42      0.42      0.40    126000
weighted avg       0.44      0.48      0.44    126000



In [47]:
# Error plus correction: the same mislabel configuration as the error-only run,
# followed by HandleLanguageErrors before the base model.
language_names_for_correction = {"en": "english", "es": "spanish", "tr": "turkish"}
mislabel_error_for_correction = (
    "error",
    CategoricalLabelTransformer(0.5, "language", language_names_for_correction),
)
language_handler_step = ("handle_errors", HandleLanguageErrors())
mislabel_error_correction = ErrorSimulationPipeline(
    steps=[
        mislabel_error_for_correction,
        language_handler_step,
        ("base_model", base_model),
    ],
    data=test_df,
)

In [48]:
# Run the language-mislabel pipeline that includes the handle_errors step.
mislabel_error_correction.exec()

BaseModelInference fit started
BaseModelInference transform started


24/04/04 19:22:26 WARN TaskSetManager: Stage 15 contains a task of very large size (2736 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [49]:
# Print the classification report for the language-mislabel run with correction.
mislabel_error_correction.report()

              precision    recall  f1-score   support

           1       0.48      0.57      0.52     17699
           2       0.32      0.30      0.31     17690
           3       0.34      0.22      0.27     21240
           4       0.37      0.18      0.24     26434
           5       0.58      0.83      0.68     42937

    accuracy                           0.48    126000
   macro avg       0.42      0.42      0.40    126000
weighted avg       0.44      0.48      0.44    126000



In [128]:
# Error-only run on `sample`: a TypoTransformer (typo_mode="missing") on
# "review_body" feeds the base model, with no correction step.
typo_error_step = (
    "typo",
    TypoTransformer(
        fraction=0.5,
        column="review_body",
        fraction_typos=0.5,
        typo_mode="missing",
    ),
)
typo_error_simulation = ErrorSimulationPipeline(
    steps=[typo_error_step, ("base_model", base_model)],
    data=sample,
)

In [129]:
# Run the typo error pipeline (the log shows the TypoTransformer step finishing
# before the BaseModelInference stages start).
typo_error_simulation.exec()

TypoTransformer finished
BaseModelInference fit started
BaseModelInference transform started


In [None]:
# 200 reviews restricted to Spanish and English. random_state pins which rows
# are drawn so the run can be reproduced.
sample_es_en = test_df[test_df["language"].isin(["es", "en"])].sample(
    200, random_state=50
)

# Error-only pipeline. Both pipelines in this cell must run on sample_es_en
# (not the unrelated `sample` frame) so the es/en filter actually applies.
typo_error_simulation_es_en = ErrorSimulationPipeline(
    steps=[
        (
            "typo",
            TypoTransformer(
                fraction=0.5,
                column="review_body",
                fraction_typos=0.5,
                typo_mode="missing",
                seed=50,
            ),
        ),
        (
            "base_model",
            base_model,
        ),
    ],
    data=sample_es_en,
)


# Same typo configuration and seed, followed by the typo-correction step.
typo_error_correction_es_en = ErrorSimulationPipeline(
    steps=[
        (
            "typo",
            TypoTransformer(
                fraction=0.5,
                column="review_body",
                fraction_typos=0.5,
                typo_mode="missing",
                seed=50,
            ),
        ),
        ("correct_typo", HandleTypoTransformer(column="review_body")),
        (
            "base_model",
            base_model,
        ),
    ],
    data=sample_es_en,
)

typo_error_simulation_es_en.exec()
typo_error_correction_es_en.exec()

In [None]:
# 2000 English-only reviews. The `_es_en` names are kept even though only "en"
# is selected, because the following report cell refers to them.
# random_state pins which rows are drawn so the run can be reproduced.
sample_es_en = test_df[test_df["language"].isin(["en"])].sample(
    2000, random_state=50
)

# Error-only pipeline. seed=50 matches the seed used by the es/en cell;
# presumably it makes both pipelines here inject the same typos — confirm
# against TypoTransformer.
typo_error_simulation_es_en = ErrorSimulationPipeline(
    steps=[
        (
            "typo",
            TypoTransformer(
                fraction=0.5,
                column="review_body",
                fraction_typos=0.5,
                typo_mode="missing",
                seed=50,
            ),
        ),
        (
            "base_model",
            base_model,
        ),
    ],
    data=sample_es_en,
)


# Same typo configuration and seed, followed by the typo-correction step.
typo_error_correction_es_en = ErrorSimulationPipeline(
    steps=[
        (
            "typo",
            TypoTransformer(
                fraction=0.5,
                column="review_body",
                fraction_typos=0.5,
                typo_mode="missing",
                seed=50,
            ),
        ),
        ("correct_typo", HandleTypoTransformer(column="review_body")),
        (
            "base_model",
            base_model,
        ),
    ],
    data=sample_es_en,
)

typo_error_simulation_es_en.exec()
typo_error_correction_es_en.exec()

In [135]:
# Print the classification report for the typo-only run, then for the run with
# typo correction.
typo_error_simulation_es_en.report()
typo_error_correction_es_en.report()

              precision    recall  f1-score   support

           1       0.64      0.47      0.54        15
           2       0.12      0.12      0.12        17
           3       0.33      0.17      0.23        23
           4       0.24      0.22      0.23        18
           5       0.40      0.63      0.49        27

    accuracy                           0.34       100
   macro avg       0.34      0.32      0.32       100
weighted avg       0.34      0.34      0.33       100

              precision    recall  f1-score   support

           1       0.62      0.67      0.65        15
           2       0.19      0.24      0.21        17
           3       0.25      0.17      0.21        23
           4       0.47      0.44      0.46        18
           5       0.63      0.70      0.67        27

    accuracy                           0.45       100
   macro avg       0.43      0.44      0.44       100
weighted avg       0.44      0.45      0.44       100



In [None]:
# 100 Spanish-only reviews. random_state pins which rows are drawn so the run
# can be reproduced.
sample_es = test_df[test_df["language"] == "es"].sample(100, random_state=50)

# Error-only pipeline on the Spanish sample. seed=50 matches the seed used by
# the es/en cell; presumably it makes both pipelines here inject the same
# typos — confirm against TypoTransformer.
typo_error_simulation_es = ErrorSimulationPipeline(
    steps=[
        (
            "typo",
            TypoTransformer(
                fraction=0.5,
                column="review_body",
                fraction_typos=0.5,
                typo_mode="missing",
                seed=50,
            ),
        ),
        (
            "base_model",
            base_model,
        ),
    ],
    data=sample_es,
)


# Same typo configuration and seed, followed by the typo-correction step.
typo_error_correction_es = ErrorSimulationPipeline(
    steps=[
        (
            "typo",
            TypoTransformer(
                fraction=0.5,
                column="review_body",
                fraction_typos=0.5,
                typo_mode="missing",
                seed=50,
            ),
        ),
        ("correct_typo", HandleTypoTransformer(column="review_body")),
        (
            "base_model",
            base_model,
        ),
    ],
    data=sample_es,
)

# Execute the Spanish pipelines built in this cell; running the generic
# typo_error_simulation here would leave typo_error_simulation_es unexecuted.
typo_error_simulation_es.exec()
typo_error_correction_es.exec()

In [141]:
# NOTE(review): the first call reports typo_error_simulation, not the
# typo_error_simulation_es pipeline built for the Spanish sample — confirm which
# pipeline was meant to be compared with typo_error_correction_es.
typo_error_simulation.report()
typo_error_correction_es.report()

              precision    recall  f1-score   support

           1       0.64      0.47      0.54        15
           2       0.12      0.12      0.12        17
           3       0.33      0.17      0.23        23
           4       0.24      0.22      0.23        18
           5       0.40      0.63      0.49        27

    accuracy                           0.34       100
   macro avg       0.34      0.32      0.32       100
weighted avg       0.34      0.34      0.33       100

              precision    recall  f1-score   support

           1       0.31      0.62      0.42        16
           2       0.29      0.19      0.23        21
           3       0.17      0.13      0.15        15
           4       0.29      0.24      0.26        21
           5       0.60      0.56      0.58        27

    accuracy                           0.36       100
   macro avg       0.33      0.35      0.33       100
weighted avg       0.36      0.36      0.35       100



## Accuracy improves when applying error correction for English, but not for other languages


In [130]:
# Print the classification report for the typo-only pipeline.
typo_error_simulation.report()

              precision    recall  f1-score   support

           1       0.64      0.47      0.54        15
           2       0.12      0.12      0.12        17
           3       0.33      0.17      0.23        23
           4       0.24      0.22      0.23        18
           5       0.40      0.63      0.49        27

    accuracy                           0.34       100
   macro avg       0.34      0.32      0.32       100
weighted avg       0.34      0.34      0.33       100



In [113]:
# Rebind `sample` to 100 English-only reviews. random_state pins which rows are
# drawn, so the cells below that use `sample` are reproducible.
sample = test_df[test_df["language"] == "en"].sample(100, random_state=50)

In [115]:
# Error plus correction on `sample`: a TypoTransformer (typo_mode="missing"),
# then HandleTypoTransformer on the same column, then the base model.
typo_error_for_correction = (
    "typo",
    TypoTransformer(
        fraction=0.5,
        column="review_body",
        fraction_typos=0.5,
        typo_mode="missing",
    ),
)
typo_corrector_step = ("correct_typo", HandleTypoTransformer(column="review_body"))
typo_error_correction = ErrorSimulationPipeline(
    steps=[typo_error_for_correction, typo_corrector_step, ("base_model", base_model)],
    data=sample,
)

HandleTypoTransformer started


In [82]:
# Remove the column-width limit so long review texts are displayed in full.
pd.options.display.max_colwidth = None

In [112]:
# Rows recorded as affected by the "typo" step, looked up in the data held by
# the "correct_typo" step.
correction_steps = typo_error_correction.pipeline.named_steps
corrected_reviews = correction_steps["correct_typo"].corrected_data
typo_row_index = correction_steps["typo"].affected_rows_indices
corrected_reviews.loc[typo_row_index]

Unnamed: 0,review_body,language,stars
75024,"esc especialmente perfecta, es bien como anillo al dedo o casio marlin, y el kit de monaje es súper práctico",es,5
93060,"esc par un regalo me gustó mucho. buen producto no taro n legar. l recomiendo, volvería a comprarlo.",es,5
73047,"esc bien, en suaves y cómodas pero n me a llegado e color qu había pedido. si cuela, cela. n la devuelvo porque ya e a bien.",es,3
20774,esc hay que decir ua osa.... originales. on eo ya garantiza las 5 estrellas,es,5
89488,Mükmmel bi ürün.,tr,5
67614,esc pinto estrellas porque aunque esta sin caldas recio y estan muy xulas son u poco mas incomodas de lo q pensaba d toda forma sigo pensando que son buenas zapatillas y por e tiempo se acatan mas son u poco mas comodas,es,4
13401,esc Mejor en todo que en realidad,es,3
123300,"These nuts are good, not too sweet, and overwhelming flavor. added a few to my slotted almonds for a good sack. I like the unique flavor, which gives me some variety in my snacking.",en,5
72680,I did not fit properly.,en,2
65272,ok başrlatlı. brn bu adar beklemiyodum anlatsından bir daha alacağım????,tr,5


In [109]:
# Rows recorded as affected by the "typo" step, looked up in that step's own
# corrupted data.
typo_step = typo_error_correction.pipeline.named_steps["typo"]
corrupted_reviews = typo_step.corrupted_data
corrupted_row_index = (
    typo_error_correction.pipeline.named_steps["typo"].affected_rows_indices
)
corrupted_reviews.loc[corrupted_row_index]

Unnamed: 0,review_body,language,stars
75024,"simlemente perfeca, e vien como anillo al dedo i casio marlin, y l kit de monaje e súpe práctico",es,5
93060,"fue par un regalo e ustó mucho. bun producto no taro n legar. l recomiendo, volvería a comprarlo",es,5
73047,"están ben, sn suaves y cómodas peo n me a llegado e color qu había pedido. si cuela, cela. n ls devuelvo porqe ya e a bie.",es,3
20774,solo hay ue deir ua osa.... riginales. on eo ya garantiza las 5 estrellas,es,5
89488,mükmmel bi ürün.,tr,5
67614,le ongo estrllas orque aunque esta bin caldad recio y estan muy xulas son u poc mas incomodas de lo q pensaba d tods forma sigo pnsando que so buenas zaatillas y por e tiemo s adatan mas son u pco ma comodas,es,4
13401,mjor en foo que n realidad,es,3
123300,"thes nts ae god, not too sweet& ovepowering flavor. dd few to my slted almonds for god sack. i like th unique flavor, whic give me some varety in m snacking.",en,5
72680,id not fit proerly,en,2
65272,ok başrılı. brn bu adar beklemiyodum anısından bir daha alacağım????,tr,5


In [108]:
# The same affected rows, looked up in the `sample` frame for comparison.
original_row_index = typo_error_correction.pipeline.named_steps[
    "typo"
].affected_rows_indices
sample.loc[original_row_index]

Unnamed: 0,review_body,language,stars
75024,"simplemente perfecta, le viene como anillo al dedo a mi casio marlin, y el kit de montaje es súper práctico",es,5
93060,"fue para un regalo y le gustó mucho. buen producto y no tardo en llegar. lo recomiendo, volvería a comprarlo",es,5
73047,"están bien,son suaves y cómodas pero no me ha llegado el color que había pedido.si cuela,cuela.no las devuelvo porque ya me va bien.",es,3
20774,solo hay que decir una cosa....originales. con eso ya garantiza las 5 estrellas,es,5
89488,mükemmel bir ürün.,tr,5
67614,le pongo 4 estrellas porque aunque esta bien calidad precio y estan muy xulas son un poco mas incomodas de lo q pensaba de todas formas sigo pensando que son buenas zapatillas y por el tiempo se adaptan mas y son un poco mas comodas,es,4
13401,mejor en foto que en realidad,es,3
123300,"these nuts are good, not too sweet & overpowering flavor. i add a few to my salted almonds for a good snack. i like the unique flavors, which give me some variety in my snacking.",en,5
72680,did not fit properly,en,2
65272,çok başarılı . brn bu kadar beklemiyordum aynısından bir daha alacağım????,tr,5


In [None]:
# Execute the typo_error_correction pipeline (typo -> correct_typo -> base_model steps).
typo_error_correction.exec()

In [None]:
# Display the row index labels that the "typo" step recorded as affected
# (the same values are passed to .loc in other cells of this notebook).
typo_error_correction.pipeline.named_steps["typo"].affected_rows_indices

In [126]:
# Print the classification report for the typo run with correction applied.
typo_error_correction.report()

              precision    recall  f1-score   support

           1       0.47      0.47      0.47        15
           2       0.20      0.29      0.24        17
           3       0.25      0.13      0.17        23
           4       0.41      0.39      0.40        18
           5       0.58      0.67      0.62        27

    accuracy                           0.40       100
   macro avg       0.38      0.39      0.38       100
weighted avg       0.39      0.40      0.39       100

