## Pre-requisites
---

Connect to Google drive and change working directory

In [None]:
from google.colab import drive

drive.mount("/content/drive", force_remount=True)
%cd /content/drive/MyDrive/CS50AI_Project/

Mounted at /content/drive
/content/drive/MyDrive/CS50AI_Project


Install required packages

In [None]:
pip install -r requirements.txt &> /dev/null

Load packages

In [None]:
def load_pkg(name_abbr=None, pkg_subpkg=None, namespace=None):
  """Import the notebook's dependencies and bind them in a namespace.

  Uses importlib instead of exec()-ing generated source, so no code is built
  from strings.

  Args:
    name_abbr: dict of module name -> alias. An alias of None binds the
      module under its own (top-level) name, like a plain ``import name``.
      Defaults to the modules used by this notebook.
    pkg_subpkg: dict of module name -> attribute name, the equivalent of
      ``from module import attribute``. Defaults to the names used by this
      notebook.
    namespace: dict receiving the bindings. Defaults to the globals of the
      module defining this function (the notebook namespace).

  Raises:
    ImportError: if a module cannot be imported.
    AttributeError: if a requested attribute does not exist in its module.
  """
  import importlib

  if name_abbr is None:
    name_abbr = {"helpers": None,
                 "clean_text": None,
                 "pandas": "pd",
                 "numpy": "np",
                 "iso639": None,
                 "math": None,
                 "random": None,
                 "time": None,
                 "re": None,
                 "unicodedata": None,
                 "pickle": None,
                 "joblib": None,
                 "sentencepiece": "spm",
                 "matplotlib.pyplot": "plt",
                 "language_detector": None
                }
  if pkg_subpkg is None:
    pkg_subpkg = {"collections": "Counter",
                  "sklearn.feature_extraction.text": "CountVectorizer",
                  "sklearn.naive_bayes": "MultinomialNB",
                  "sklearn.metrics": "accuracy_score",
                  "sklearn.model_selection": "KFold"
                  }
  if namespace is None:
    namespace = globals()

  for name, abbr in name_abbr.items():
    module = importlib.import_module(name)
    if abbr:
      # `import a.b as x` binds the submodule itself
      namespace[abbr] = module
    else:
      # `import a.b` binds the top-level package `a`
      top_level = name.partition(".")[0]
      namespace[top_level] = importlib.import_module(top_level)

  for pkg, subpkg in pkg_subpkg.items():
    namespace[subpkg] = getattr(importlib.import_module(pkg), subpkg)
load_pkg()

## Dataset

### Load dataset

In [None]:
langdetect_dataset = pd.read_csv("Data/langdetect.csv")

In [None]:
langdetect_dataset['language'].value_counts()

English       10651
French         9747
Spanish        9462
Russian        9160
Portuguese     9138
Arabic         8910
Dutch          8867
Turkish        8820
Japanese       8242
Chinese        8236
Hindi          8174
Thai           8101
Italian        7614
German         7461
Urdu           7389
Greek          7154
Bulgarian      6627
Vietnamese     6607
Polish         6539
Korean         4999
Persian        4998
Danish         4996
Swedish        4994
Romanian       4987
Indonesian     4959
Swahili        4944
Latin          4912
Estonian       4674
Tamil          1842
Malayalam      1471
Pushto         1000
Kannada         543
Name: language, dtype: int64

In [None]:
# Drop languages with fewer than MIN_LANGUAGE_COUNT rows.
# Filtering on the count itself (instead of "whatever language is last in
# value_counts()") matches the stated intent and keeps the cell idempotent:
# re-running it no longer removes one more language each time.
MIN_LANGUAGE_COUNT = 1000
language_counts = langdetect_dataset["language"].value_counts()
rare_languages = language_counts.index[language_counts < MIN_LANGUAGE_COUNT]
langdetect_dataset = langdetect_dataset.loc[~langdetect_dataset["language"].isin(rare_languages)]

In [None]:
langdetect_dataset["split"].value_counts(normalize=True)

train    0.798488
test     0.201512
Name: split, dtype: float64

### Text normalization

1. lowercase text
2. remove multiple whitespaces
3. remove numbers

In [None]:
# Keep an untouched copy of the text in a leading "raw" column; the "text"
# column is normalized in the following cells.
ordered_colnames = langdetect_dataset.columns.tolist()
langdetect_dataset = langdetect_dataset.assign(raw=langdetect_dataset["text"])
langdetect_dataset = langdetect_dataset[["raw", *ordered_colnames]]

In [None]:
langdetect_dataset = clean_text.lowercase(langdetect_dataset, "text")
langdetect_dataset = clean_text.rm_multiplespace(langdetect_dataset, "text")
langdetect_dataset = clean_text.rm_numbers(langdetect_dataset, "text")

### Tokenization

Pre-tokenization: split on whitespace and punctuation

In [None]:
langdetect_dataset = clean_text.pre_tokenize(langdetect_dataset, "text")

Train unigram tokenizer using train set  
[sentencepiece](https://github.com/google/sentencepiece/blob/master/python/README.md)

In [None]:
# set vocabulary size of tokenizer
VOCAB_SIZE = 50000

In [None]:
# comment lines
'''
langdetect_dataset.loc[langdetect_dataset["split"] == "train", "text"].to_csv("Data/textcorpus.txt",
                                                                              sep="\t",
                                                                              index=False,
                                                                              header=False)
'''

In [None]:
# comment lines
'''
spm.SentencePieceTrainer.train(
    input = "Data/textcorpus.txt",
    model_prefix = "tokenizer",
    vocab_size = VOCAB_SIZE,
    model_type = "unigram")
'''

Tokenize dataset

In [None]:
tokenizer = spm.SentencePieceProcessor(model_file = "tokenizer.model")

In [None]:
# Lookup table from token id to its sentencepiece piece.
# NOTE(review): ids 0-3 are skipped, presumably reserved/special tokens; confirm against the trained model.
map_id_to_piece = {
    token_id: tokenizer.id_to_piece(token_id)
    for token_id in range(4, VOCAB_SIZE)
}

In [None]:
langdetect_dataset = langdetect_dataset.copy()
langdetect_dataset["tokens"] = langdetect_dataset["text"]
langdetect_dataset = clean_text.tokenize(langdetect_dataset, "tokens", tokenizer)

### Vectorization

Create sparse matrix of count vector for train and test set

In [None]:
X_train = clean_text.vectorize(langdetect_dataset.loc[langdetect_dataset["split"] == "train",], "tokens", VOCAB_SIZE)
X_test = clean_text.vectorize(langdetect_dataset.loc[langdetect_dataset["split"] == "test",], "tokens", VOCAB_SIZE)

### Class variable

Create y array of language class for train and test set

In [None]:
y_train = langdetect_dataset.loc[langdetect_dataset["split"] == "train", "language"].to_numpy()
y_test = langdetect_dataset.loc[langdetect_dataset["split"] == "test", "language"].to_numpy()

## Model

### Multinomial Naive Bayes
[Ritchie Ng](https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/)

#### Smoothing Parameter

Tune alpha using 5-fold cross-validation method

In [None]:
# comment lines
'''
trainset = langdetect_dataset.loc[langdetect_dataset["split"]=="train", ]
cv_results_alpha = helpers.nb_tune_alpha(X_train, y_train, trainset, [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1])
'''

Cross-validation results

In [None]:
# comment lines
'''
pd.DataFrame(cv_results_alpha)
'''

Looking across all three metrics, alpha = 1e-2 is chosen

In [None]:
# comment lines
'''
helpers.print_cv(cv_results_alpha, "alpha", logaxis=True)
'''

In [None]:
ALPHA = 1e-2

Train model with chosen alpha

In [None]:
# comment lines
'''
np.random.seed(1)
nb_model = MultinomialNB(alpha=ALPHA)
nb_model.fit(X_train, y_train)

# export model
joblib.dump(nb_model, "multinomialnb_model.joblib")
'''

Training metrics

In [None]:
nb_model = joblib.load("multinomialnb_model.joblib")

trainset = langdetect_dataset.loc[langdetect_dataset["split"]=="train", ]
trainset = helpers.nb_predict(nb_model, X_train, trainset, "text")

In [None]:
train_score_alpha = accuracy_score(trainset["language"], trainset["nb_predict"], normalize=True)*100
helpers.print_score(train_score_alpha, "Training")

Training accuracy: 98.62%


In [None]:
train_fnr_alpha = helpers.print_fnr(trainset["language"], trainset["nb_predict"], "Training")

Training FNR: 0.99%


In [None]:
train_fpr_alpha = helpers.print_fpr(trainset["language"], trainset["nb_predict"], "Training")

Training FPR: 0.04%


Compare metrics against model's prediction without threshold adjustment

In [None]:
train_score_base = accuracy_score(trainset["language"], trainset["best_predict"], normalize=True)*100
train_fnr_base = helpers.fnr(trainset["language"], trainset["best_predict"])
train_fpr_base = helpers.fpr(trainset["language"], trainset["best_predict"])

pd.DataFrame({"accuracy": [train_score_alpha, train_score_base],
              "fnr": [train_fnr_alpha, train_fnr_base],
              "fpr": [train_fpr_alpha, train_fpr_base]},
             index=["with adjustment (t=0.9)", "without adjustment"])

Unnamed: 0,accuracy,fnr,fpr
with adjustment (t=0.9),98.620219,0.989516,0.041736
without adjustment,99.464772,0.318059,0.06742


#### Probability Threshold level

Tune t, probability confidence level, in the threshold formula:  
threshold  
= t; if length < 10  
= exp(1/length)*t; else length >= 10  
where length = length of text

In [None]:
# comment lines
'''
trainset = langdetect_dataset.loc[langdetect_dataset["split"]=="train", ]
cv_results_threshold = helpers.nb_tune_threshold(X_train, y_train, trainset, ALPHA, [0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9])
'''

Cross-validation results

In [None]:
# comment lines
'''
pd.DataFrame(cv_results_threshold)
'''

There is a trade-off between FNR and FPR.

*   For this project, FN is a more costly error than FP
*   The change in FPR is minimal for different values of t
*   FNR moves in the same direction as accuracy score

t = 0.8 is chosen for the threshold formula

In [None]:
# comment lines
'''
helpers.print_cv(cv_results_threshold, "threshold")
'''

In [None]:
THRESHOLD = 0.8

Training metrics with new threshold formula

In [None]:
nb_model = joblib.load("multinomialnb_model.joblib")

In [None]:
trainset = langdetect_dataset.loc[langdetect_dataset["split"]=="train", ]
trainset = helpers.nb_predict(nb_model, X_train, trainset, "text", t=THRESHOLD)

train_score_t = accuracy_score(trainset["language"], trainset["nb_predict"], normalize=True)*100
helpers.print_score(train_score_t, "Training")

Training accuracy: 99.06%


In [None]:
train_fnr_t = helpers.print_fnr(trainset["language"], trainset["nb_predict"], "Training")

Training FNR: 0.65%


In [None]:
train_fpr_t = helpers.print_fpr(trainset["language"], trainset["nb_predict"], "Training")

Training FPR: 0.05%


Compare metrics against model's prediction, default threshold and chosen threshold

In [None]:
pd.DataFrame({"accuracy": [train_score_t, train_score_alpha, train_score_base],
              "fnr": [train_fnr_t, train_fnr_alpha, train_fnr_base],
              "fpr": [train_fpr_t, train_fpr_alpha, train_fpr_base]},
             index=["with adjustment(t=0.8)", "with adjustment(t=0.9)", "without adjustment"])

Unnamed: 0,accuracy,fnr,fpr
with adjustment(t=0.8),99.060458,0.647897,0.048157
with adjustment(t=0.9),98.620219,0.989516,0.041736
without adjustment,99.464772,0.318059,0.06742


Examine text with predicted language = UNKNOWN

In [None]:
trainset.loc[trainset["nb_predict"] == "UNKNOWN", "text"].str.len().mean()

16.874892148403795

In [None]:
trainset.loc[trainset["nb_predict"] == "UNKNOWN", ["threshold", "best_prob"]].mean()

threshold    0.838092
best_prob    0.633879
dtype: float64

### Unicode Rule

Adapted from guess-language by [@kent37](https://github.com/kent37/guess-language)

Train language specific n grams using train set

In [None]:
N_VALUE = 3
N_FEATURE = 300
TRAINSET = trainset.copy()

In [None]:
# comment lines
'''
N_GRAMS = helpers.train_ngrams(TRAINSET, N_VALUE, N_FEATURE)

# export N-grams
with open("N_GRAMS.pickle", "wb") as handle:
  pickle.dump(N_GRAMS, handle, protocol=pickle.HIGHEST_PROTOCOL)
'''

In [None]:
# Load the language n-gram templates previously exported to N_GRAMS.pickle
# (see the disabled export cell above).
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# file produced by this project, never one from an untrusted source.
with open("N_GRAMS.pickle", "rb") as handle:
  N_GRAMS = pickle.load(handle)

Showcase unicode rule to assign new language class for predicted language = UNKNOWN

In [None]:
# mapping
helpers.SCRIPT_TO_LANGUAGE

{'Malayalam': 'Malayalam',
 'Tamil': 'Tamil',
 'Thai': 'Thai',
 'Devanagari': 'Hindi',
 'Latin Extended Additional': 'Vietnamese',
 'Greek and Coptic': 'Greek',
 'Greek Extended': 'Greek',
 'Hiragana': 'Japanese',
 'Katakana': 'Japanese',
 'Katakana Phonetic Extensions': 'Japanese',
 'Hangul Syllables': 'Korean',
 'Hangul Jamo': 'Korean',
 'Hangul Compatibility Jamo': 'Korean',
 'Hangul Jamo Extended-A': 'Korean',
 'Hangul Jamo Extended-B': 'Korean',
 'CJK Radicals Supplement': 'Chinese',
 'CJK Symbols and Punctuation': 'Chinese',
 'CJK Strokes': 'Chinese',
 'CJK Compatibility': 'Chinese',
 'CJK Unified Ideographs Extension A': 'Chinese',
 'CJK Unified Ideographs': 'CJK',
 'CJK Compatibility Ideographs': 'Chinese',
 'CJK Compatibility Forms': 'Chinese',
 'Kangxi Radicals': 'Chinese',
 'Bopomofo': 'Chinese',
 'Bopomofo Extended': 'Chinese',
 'Latin-1 Supplement': 'EXTENDED LATIN',
 'Latin Extended-A': 'EXTENDED LATIN',
 'Latin Extended-B': 'EXTENDED LATIN',
 'Latin Extended-C': 'EXTENDE

In [None]:
# Step-by-step walkthrough of the unicode rule, on rows the NB model labelled UNKNOWN
unknown = TRAINSET.loc[TRAINSET["nb_predict"] == "UNKNOWN"].copy()
# share of the text's characters in each unicode block
unknown["blocks"] = unknown["text"].apply(helpers.count_blocks)
# dominant block of each text (helper is expected to require at least 30% coverage)
unknown["highest"] = unknown["blocks"].apply(helpers.highest_block)
# translate the dominant block into a script/language label; unmapped blocks become "UNKNOWN"
unknown["script"] = unknown["highest"].apply(lambda block: helpers.SCRIPT_TO_LANGUAGE.get(block, "UNKNOWN"))
# the unicode rule only applies to rows whose dominant block has an entry in the mapping
nonunknown = unknown.loc[unknown["script"] != "UNKNOWN"].copy()

In [None]:
nonunknown.iloc[[1,50,200,500,600],]

Unnamed: 0,raw,text,split,source,language,tokens,best_predict,best_prob,threshold,nb_predict,blocks,highest,script
347,Kennedy.,kennedy,train,basil,English,10990,English,0.238247,0.8,UNKNOWN,{'Basic Latin': 1.0},Basic Latin,BASIC LATIN
6676,Pr.,pr,train,basil,Danish,3968,Polish,0.139308,0.8,UNKNOWN,{'Basic Latin': 1.0},Basic Latin,BASIC LATIN
91208,Маловероятно .,маловероятно,train,papluca,Russian,22523 3667 313 1532 749,Russian,0.858921,0.869523,UNKNOWN,{'Cyrillic': 1.0},Cyrillic,CYRILLIC
146075,Es-tu le prophète ?,es tu le prophète,train,chazzer,French,74 429 41 6812 222 15398,French,0.839509,0.84847,UNKNOWN,"{'Basic Latin': 0.9285714285714286, 'Latin-1 S...",Basic Latin,BASIC LATIN
159834,Soltanto Tom sorrise.,soltanto tom sorrise,train,chazzer,Italian,2544 8242 72 23177 343,Italian,0.615504,0.841017,UNKNOWN,{'Basic Latin': 1.0},Basic Latin,BASIC LATIN


In [None]:
# step 1: main script = CJK, classify as Chinese, Korean, Japanese
def p1(row):
  """Resolve the generic "CJK" label via helpers.cjk_rule; pass any other label through."""
  if row["script"] != "CJK":
    return row["script"]
  # the helper decides from the row's per-block character shares
  return helpers.cjk_rule(row["blocks"])

nonunknown["p1"] = nonunknown.apply(lambda row: p1(row), axis=1)

In [None]:
# step 2: main script = ARABIC, CYRILLIC, LATIN, EXTENDED LATIN, classify based on unique char for each language
def p2(row):
  """Try to pin a multi-language script down to one language.

  Returns the language reported by helpers.unique_chars when it finds one;
  otherwise the step-1 label is returned unchanged.
  """
  script = row["p1"]
  if script not in helpers.SCRIPT_TO_MULTILANGUAGE:
    return script
  detected = helpers.unique_chars(row["text"], script)
  return detected if detected else script

nonunknown["p2"] = nonunknown.apply(lambda row: p2(row), axis=1)

In [None]:
# step 3: main script = ARABIC, CYRILLIC, LATIN, EXTENDED LATIN
#         create an n-gram profile of the text (n = N_VALUE, top N_FEATURE n-grams)
#         compare the distance of that profile against the template of every
#         candidate language of the script
def p3(row):
  """Compute n-gram distances to each candidate language of a multi-language script.

  Uses the notebook constants N_VALUE / N_FEATURE (and the N_GRAMS templates)
  instead of repeating their literal values, so this walkthrough cannot drift
  from the settings used elsewhere in the notebook.

  Returns:
    - the step-2 label unchanged when it is not a multi-language script,
    - "UNKNOWN" when the text is shorter than N_VALUE characters,
    - otherwise a dict {language: distance}.
  """
  script = row["p2"]
  if script not in helpers.SCRIPT_TO_MULTILANGUAGE:
    return script

  text = row["text"]
  if len(text) < N_VALUE:
    # too short to form a single n-gram
    return "UNKNOWN"

  text_ngram = helpers.create_ngram([text], N_VALUE, N_FEATURE, False)
  if script in ("BASIC LATIN", "EXTENDED LATIN"):
    # both latin scripts share one candidate pool
    languages = helpers.SCRIPT_TO_MULTILANGUAGE["BASIC LATIN"] + helpers.SCRIPT_TO_MULTILANGUAGE["EXTENDED LATIN"]
  else:
    languages = helpers.SCRIPT_TO_MULTILANGUAGE[script]

  return {language: helpers.ngram_distance(text_ngram, language, N_GRAMS, N_FEATURE)
          for language in languages}

nonunknown["p3"] = nonunknown.apply(lambda row: p3(row), axis=1)

In [None]:
# step 4: main script = ARABIC, CYRILLIC, LATIN, EXTENDED LATIN
#         find language with minimum distance
def p4(d):
  """Return (closest language, its distance) for a distance dict.

  Anything that is not a dict (an already-decided label) is returned as
  (label, False).
  """
  if not isinstance(d, dict):
    return (d, False)
  closest = min(d, key=d.get)
  return (closest, d[closest])

nonunknown["p4"] = nonunknown["p3"].apply(lambda d: p4(d))

In [None]:
# step 5: main script = ARABIC, CYRILLIC, LATIN, EXTENDED LATIN
#         compare the minimum distance vs distance to English language
def p5(row):
  """Relative gap between the English distance and the minimum distance.

  Returns (english_distance - min_distance) / min_distance, or False when
  step 3 produced no distance dict or English is not among the candidates.
  """
  distances = row["p3"]
  if not isinstance(distances, dict) or "English" not in distances:
    return False
  min_distance = row["p4"][1]
  return (distances["English"] - min_distance) / min_distance

nonunknown["p5"] = nonunknown.apply(lambda row: p5(row), axis=1)

In [None]:
# step 6: if the English distance is within 1% of the minimum distance,
#         assign language as English (biased towards English to reduce FNR)
#         else, assign the language with minimum distance
# NOTE(review): the original comment also mentioned "for tie, assign as UNKNOWN";
#               this function has no tie handling, so verify where ties are resolved.
def p6(row):
  """Pick the final label from the step-4 winner and the step-5 relative gap."""
  gap = row["p5"]
  # `== False` also matches a gap of exactly 0.0; either way the step-4 winner is returned
  if gap == False:
    return row["p4"][0]
  return "English" if gap < 0.01 else row["p4"][0]

nonunknown["p6"] = nonunknown.apply(lambda row: p6(row), axis=1)

In [None]:
# all steps are consolidated into unicode_predict()
nonunknown = helpers.unicode_predict(nonunknown, N_GRAMS, 3, 300, 0.01)

In [None]:
nonunknown.iloc[[1,50,200,500,600],]

Unnamed: 0,raw,text,split,source,language,tokens,best_predict,best_prob,threshold,nb_predict,blocks,highest,script,p1,p2,p3,p4,p5,p6,unicode_predict
347,Kennedy.,kennedy,train,basil,English,10990,English,0.238247,0.8,UNKNOWN,{'Basic Latin': 1.0},Basic Latin,BASIC LATIN,BASIC LATIN,BASIC LATIN,"{'English': 90000, 'Latin': 90000, 'Indonesian...","(Danish, 89657)",0.003826,English,English
6676,Pr.,pr,train,basil,Danish,3968,Polish,0.139308,0.8,UNKNOWN,{'Basic Latin': 1.0},Basic Latin,BASIC LATIN,BASIC LATIN,BASIC LATIN,UNKNOWN,"(UNKNOWN, False)",False,UNKNOWN,UNKNOWN
91208,Маловероятно .,маловероятно,train,papluca,Russian,22523 3667 313 1532 749,Russian,0.858921,0.869523,UNKNOWN,{'Cyrillic': 1.0},Cyrillic,CYRILLIC,CYRILLIC,CYRILLIC,"{'Russian': 89708, 'Bulgarian': 89741}","(Russian, 89708)",False,Russian,Russian
146075,Es-tu le prophète ?,es tu le prophète,train,chazzer,French,74 429 41 6812 222 15398,French,0.839509,0.84847,UNKNOWN,"{'Basic Latin': 0.9285714285714286, 'Latin-1 S...",Basic Latin,BASIC LATIN,BASIC LATIN,BASIC LATIN,"{'English': 88498, 'Latin': 89073, 'Indonesian...","(French, 88229)",0.003049,English,English
159834,Soltanto Tom sorrise.,soltanto tom sorrise,train,chazzer,Italian,2544 8242 72 23177 343,Italian,0.615504,0.841017,UNKNOWN,{'Basic Latin': 1.0},Basic Latin,BASIC LATIN,BASIC LATIN,BASIC LATIN,"{'English': 88745, 'Latin': 89015, 'Indonesian...","(Portuguese, 88427)",0.003596,English,English


#### Margin
Tune m, margin, in the unicode rule  
It represents the level of bias towards classifying text as English

In [None]:
# comment lines
'''
cv_results_margin = helpers.unicode_tune_margin(TRAINSET, [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01], N_GRAMS, N_VALUE, N_FEATURE)
'''

In [None]:
# comment lines
'''
pd.DataFrame(cv_results_margin)
'''

In [None]:
# comment lines
'''
helpers.print_cv(cv_results_margin, "margin")
'''

In [None]:
MARGIN = 0.006

Training metrics with unicode rule

In [None]:
trainset = helpers.unicode_predict(TRAINSET, N_GRAMS, N_VALUE, N_FEATURE, MARGIN)

In [None]:
train_score_unicode = accuracy_score(trainset["language"], trainset["unicode_predict"], normalize=True)*100
helpers.print_score(train_score_unicode, "[Unicode] Train")

[Unicode] Train accuracy: 99.25%


In [None]:
train_fnr_unicode = helpers.print_fnr(trainset["language"], trainset["unicode_predict"], "[Unicode] Train")

[Unicode] Train FNR: 0.24%


In [None]:
train_fpr_unicode = helpers.print_fpr(trainset["language"], trainset["unicode_predict"], "[Unicode] Train")

[Unicode] Train FPR: 0.32%


Compare training metrics with NB model performance

In [None]:
pd.DataFrame({"accuracy": [train_score_unicode, train_score_t, train_score_alpha, train_score_base],
              "fnr": [train_fnr_unicode, train_fnr_t, train_fnr_alpha, train_fnr_base],
              "fpr": [train_fpr_unicode, train_fpr_t, train_fpr_alpha, train_fpr_base]},
             index=["NB model(t=0.8)+Unicode Rule", "NB model(t=0.8)", "NB model(t=0.9)", "NB model"])

Unnamed: 0,accuracy,fnr,fpr
NB model(t=0.8)+Unicode Rule,99.249219,0.235599,0.31848
NB model(t=0.8),99.060458,0.647897,0.048157
NB model(t=0.9),98.620219,0.989516,0.041736
NB model,99.464772,0.318059,0.06742


Examine text with predicted language = UNKNOWN

In [None]:
trainset.loc[trainset["unicode_predict"] == "UNKNOWN", "best_prob"].mean()

0.5334796481618214

## Model evaluation

Test metrics of NB model with no threshold adjustment

In [None]:
testset = langdetect_dataset.loc[langdetect_dataset["split"]=="test", ]
testset = helpers.nb_predict(nb_model, X_test, testset, "text", t=THRESHOLD)

test_score_base = accuracy_score(testset["language"], testset["best_predict"], normalize=True)*100
test_fnr_base = helpers.fnr(testset["language"], testset["best_predict"])
test_fpr_base = helpers.fpr(testset["language"], testset["best_predict"])

Test metrics of NB model (with alpha = 0.01 and threshold = 0.8)

In [None]:
test_score_t = accuracy_score(testset["language"], testset["nb_predict"], normalize=True)*100
test_fnr_t = helpers.fnr(testset["language"], testset["nb_predict"])
test_fpr_t = helpers.fpr(testset["language"], testset["nb_predict"])

Test metrics of NB model + Unicode rule

In [None]:
testset = helpers.unicode_predict(testset, N_GRAMS, N_VALUE, N_FEATURE, MARGIN)

In [None]:
test_score_unicode = accuracy_score(testset["language"], testset["unicode_predict"], normalize=True)*100
test_fnr_unicode = helpers.fnr(testset["language"], testset["unicode_predict"])
test_fpr_unicode = helpers.fpr(testset["language"], testset["unicode_predict"])

Compare test metrics

In [None]:
pd.DataFrame({"accuracy": [test_score_unicode, test_score_t, test_score_base],
              "fnr": [test_fnr_unicode, test_fnr_t, test_fnr_base],
              "fpr": [test_fpr_unicode, test_fpr_t, test_fpr_base]},
             index=["NB model(t=0.8)+Unicode Rule", "NB model(t=0.8)", "NB model"])

Unnamed: 0,accuracy,fnr,fpr
NB model(t=0.8)+Unicode Rule,99.075906,0.370028,0.392017
NB model(t=0.8),98.8105,0.647549,0.06873
NB model,99.28823,0.462535,0.094186


## Final Model

Create a pipeline for language detector model

In [None]:
sample_data = pd.read_csv("Data/langdetect_unseen.csv")

In [None]:
# Draw a fixed 5000-row sample; random_state makes the sample (and the
# predictions displayed below) reproducible across kernel restarts.
sample_data = sample_data.sample(5000, random_state=1)

In [None]:
language_detector.language_detector(sample_data, "text")

Unnamed: 0,text,language,predicted_language
2466273,boisz się tego,Polish,Polish
4526487,gracias por aclarármelo,Spanish,Spanish
4057996,a vida nem sempre é um longo rio tranquilo,Portuguese,Portuguese
4181387,you re a good painter,English,English
364755,je ne sais avec certitude quand il arrivera,French,French
...,...,...,...
3503879,biz yeri yıkamalıyız,Turkish,Turkish
5454228,er ist jetzt jahre alt,German,German
2398638,ho fatto del curry per la prima volta,Italian,Italian
1592247,die familien der verstorbenen bergleute erhiel...,German,German


## PREVIOUS WORK

Metrics

In [None]:
helpers.print_fnr(testset["language"], testset["best_predict"], "Test")

| --- | Before | After 10k | After 20k | After 30k | After 40k | After 50k |
| --- | :---: | :---: | :---: | :---: | :---: | :---: |
| Training Accuracy | 98.13 | 93.99 | 94.97 | 95.32 | 95.39 | 95.43 |
| Training FNR | 1.67 | 2.85 | 2.07 | 1.84 | 1.62 | 1.54 |
| Test Accuracy | 98.52 | 91.55 | 91.98 | 92.62 | 92.64 | 92.7 |
| Test FNR | 0.77 | 0.86 | 0.69 | 0.77 | 0.77 | 0.69 |
| Test Accuracy | 99.19 | 95.67 | 95.98 | 96.39 | 96.44 | 96.5 |
| Test FNR | 0.6 | 0.69 | 0.52 | 0.52 | 0.52 | 0.52 |



Unicode rule

In [None]:
# unicode english / non english
testset['unicode'] = testset['text'].apply(lambda text: helpers.engCheck(text))

In [None]:
# rows whose best NB probability is below their threshold
below_threshold = testset['best_prob'] < testset['threshold']
check = testset[below_threshold]
# among those, rows flagged as english unicode are relabelled as English
english_unicode = testset['unicode'] == 'Eng Unicode'
testset.loc[below_threshold & english_unicode, 'nb_predict'] = "English"