## Pre-requisites

Connect to Google Drive and change the working directory to the project folder.

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive (the cell output confirms the mount point)
drive.mount("/content/drive", force_remount=True)
# Switch to the project folder so the relative paths used in later cells
# (requirements.txt, Data/...) resolve against it
%cd /content/drive/MyDrive/CS50AI_Project/

Mounted at /content/drive
/content/drive/MyDrive/CS50AI_Project


Install required packages

In [2]:
pip install -r requirements.txt &> /dev/null

Load packages

In [3]:
def load_pkg(name_abbr=None, pkg_subpkg=None, namespace=None):
  """Import the notebook's dependencies into a namespace.

  Args:
    name_abbr: mapping of module name -> alias (or None for no alias),
      equivalent to ``import name as alias`` / ``import name``.
      Defaults to the modules this notebook needs.
    pkg_subpkg: mapping of package name -> member name (or a list/tuple of
      member names), equivalent to ``from package import member``.
      Defaults to the members this notebook needs.
    namespace: dict that receives the imported names. Defaults to the
      global namespace of the module defining this function.

  Raises:
    ImportError: if a module or member cannot be imported.
  """
  # importlib replaces the string-built `exec("import ...")` statements
  import importlib

  if name_abbr is None:
    name_abbr = {"pandas": "pd",
                 "numpy": "np",
                 "math": None,
                 "helpers": None,
                 "clean_text": None
                }
  if pkg_subpkg is None:
    pkg_subpkg = {"collections": "Counter",
                  "sklearn.feature_extraction.text": "CountVectorizer"
                  }
  if namespace is None:
    namespace = globals()

  for name, abbr in name_abbr.items():
    module = importlib.import_module(name)
    if abbr:
      namespace[abbr] = module
    else:
      # a plain `import a.b` binds the top-level package `a`
      top_level = name.partition(".")[0]
      namespace[top_level] = importlib.import_module(top_level)

  for pkg, subpkg in pkg_subpkg.items():
    module = importlib.import_module(pkg)
    members = [subpkg] if isinstance(subpkg, str) else subpkg
    for member in members:
      try:
        namespace[member] = getattr(module, member)
      except AttributeError:
        # `from pkg import member` also accepts a submodule that is not loaded yet
        namespace[member] = importlib.import_module(f"{pkg}.{member}")
# run the imports so that pd, np, math, helpers, clean_text, Counter and
# CountVectorizer are available to the cells below
load_pkg()

## Load dataset
---

### Basil Saji - Language Detection
[Retrieved from Kaggle](https://www.kaggle.com/datasets/basilb2s/language-detection?datasetId=1150837&sortBy=voteCount)

In [4]:
# Read the Basil Saji dataset, assign the train/test split, tag the source
# and bring the column names in line with the other datasets
basil = (
    pd.read_csv("Data/Original/basil.csv")
    .pipe(helpers.train_test_split)
    .pipe(helpers.add_source, "basil")
    .rename(columns={"Text": "text", "Language": "language"})
)
# correct the misspelt language names
basil["language"] = (
    basil["language"]
    .apply(lambda lang: helpers.replace_lang(lang, "Portugeese", "Portuguese"))
    .apply(lambda lang: helpers.replace_lang(lang, "Sweedish", "Swedish"))
)

Data split into 8270 train set, 2067 test set


In [5]:
# preview the first rows to check the standardised columns (text, language, split, source)
basil.head()

Unnamed: 0,text,language,split,source
0,"Nature, in the broadest sense, is the natural...",English,train,basil
1,"""Nature"" can refer to the phenomena of the phy...",English,train,basil
2,"The study of nature is a large, if not the onl...",English,train,basil
3,"Although humans are part of nature, human acti...",English,train,basil
4,[1] The word nature is borrowed from the Old F...,English,train,basil


### Aman Kharwal - Language Detection with Machine Learning
<https://thecleverprogrammer.com/2021/10/30/language-detection-with-machine-learning/>

In [6]:
# Read the Aman Kharwal dataset, assign the train/test split, tag the source
# and bring the text column name in line with the other datasets
aman = (
    pd.read_csv("Data/Original/aman.csv")
    .pipe(helpers.train_test_split)
    .pipe(helpers.add_source, "aman")
    .rename(columns={"Text": "text"})
)
# use the same language names as the other datasets
aman["language"] = (
    aman["language"]
    .apply(lambda lang: helpers.replace_lang(lang, "Portugese", "Portuguese"))
    .apply(lambda lang: helpers.replace_lang(lang, "Pushto", "Pashto"))
)

Data split into 17600 train set, 4400 test set


In [7]:
# preview the first rows to check the standardised columns (text, language, split, source)
aman.head()

Unnamed: 0,text,language,split,source
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian,train,aman
1,sebes joseph pereira thomas på eng the jesuit...,Swedish,train,aman
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai,train,aman
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil,train,aman
4,de spons behoort tot het geslacht haliclona en...,Dutch,train,aman


### papluca - Language Identification
[Retrieved from Hugging Face](https://huggingface.co/datasets/papluca/language-identification)

In [8]:
# Stack the three papluca files into one frame, then assign the train/test
# split, tag the source and rename the label column
papluca_parts = [
    pd.read_csv("Data/Original/papluca_train.csv"),
    pd.read_csv("Data/Original/papluca_validation.csv"),
    pd.read_csv("Data/Original/papluca_test.csv"),
]
papluca = (
    pd.concat(papluca_parts, axis=0, ignore_index=True)
    .pipe(helpers.train_test_split)
    .pipe(helpers.add_source, "papluca")
    .rename(columns={"label": "language"})
)
# translate the ISO 639-1 codes into language names, then shorten the
# two names that differ from the ones used by the other datasets
papluca["language"] = (
    papluca["language"]
    .apply(helpers.iso639_1_to_language)
    .apply(lambda lang: helpers.replace_lang(lang, "Swahili (macrolanguage)", "Swahili"))
    .apply(lambda lang: helpers.replace_lang(lang, "Modern Greek (1453-)", "Greek"))
)

Data split into 72000 train set, 18000 test set


In [9]:
# preview the first rows to check the standardised columns (text, language, split, source)
papluca.head()

Unnamed: 0,text,language,split,source
0,"os chefes de defesa da estónia, letónia, lituâ...",Portuguese,train,papluca
1,размерът на хоризонталната мрежа може да бъде ...,Bulgarian,train,papluca
2,很好，以前从不去评价，不知道浪费了多少积分，现在知道积分可以换钱，就要好好评价了，后来我就把...,Chinese,train,papluca
3,สำหรับ ของเก่า ที่ จริงจัง ลอง honeychurch ...,Thai,train,papluca
4,Он увеличил давление .,Russian,train,papluca


### chazzer - Big Language Detection Dataset
[Retrieved from Kaggle](https://www.kaggle.com/datasets/chazzer/big-language-detection-dataset)

In [10]:
# Read the raw chazzer dataset, keep only the sentence and its language code,
# and give both columns the standard names
chazzer = (
    pd.read_csv("Data/Original/chazzer.csv")
    .loc[:, ["sentence", "lan_code"]]
    .rename(columns={"sentence": "text", "lan_code": "language"})
)

In [11]:
# keep relevant languages (the codes are converted with the ISO 639-3 helper in a later cell)
lang_codes = ['ara','bul','cmn','dan','nld','eng','est','fra','deu','ell','hin','ind','ita','jpn','kan','kor','lat','mal','pus','pes','pol','por','ron','rus','spa','swh','swe','tam','tha','tur','urd','vie']
# .copy() makes the filtered frame independent of the original, so the column
# assignments performed on it in later cells do not raise SettingWithCopyWarning
chazzer = chazzer[chazzer['language'].isin(lang_codes)].copy()

In [12]:
# translate the ISO 639-3 codes into language names, then shorten the names
# that differ from the ones used by the other datasets
chazzer["language"] = (
    chazzer["language"]
    .apply(helpers.iso639_3_to_language)
    .apply(lambda lang: helpers.replace_lang(lang, "Modern Greek (1453-)", "Greek"))
    .apply(lambda lang: helpers.replace_lang(lang, "Iranian Persian", "Persian"))
    .apply(lambda lang: helpers.replace_lang(lang, "Mandarin Chinese", "Chinese"))
    .apply(lambda lang: helpers.replace_lang(lang, "Swahili (individual language)", "Swahili"))
)

In [13]:
# number of rows to sample for each language; the keys must match the labels
# present in chazzer["language"] at this point (every group name is looked up)
lang_count = {'English': 3443,
              'French': 3257,
              'Spanish': 3160,
              'Portuguese': 3120,
              'Russian': 3096,
              'Dutch': 3023,
              'Arabic': 3018,
              'Turkish': 2987,
              'Hindi': 2782,
              'Japanese': 2750,
              'Chinese': 2750,
              'Urdu': 2063,
              'Thai': 2750,
              'Italian': 2599,
              'German': 2485,
              'Greek': 2433,
              'Swahili': 593,
              'Polish': 2250,
              'Bulgarian': 2250,
              'Vietnamese': 2250,
              'Swedish': 3324,
              'Tamil': 402,
              'Latin': 4000,
              'Estonian': 3693,
              'Indonesian': 4000,
              'Korean': 4000,
              'Pushto': 44,
              'Persian': 4000,
              'Romanian': 4000,
              'Malayalam': 878,
              'Danish': 4572,
              'Kannada': 176}
# sample each language with a fixed seed; reset_index drops the group level so
# the original row index is kept
chazzer_sampled = chazzer.groupby(["language"]).apply(lambda group: group.sample(lang_count[group.name], random_state=1)).reset_index(level=0, drop=True)
# indicate rows to be included in the train-test set
# (vectorised index membership instead of a Python-level row-by-row apply)
chazzer['sampled'] = chazzer.index.isin(chazzer_sampled.index)
# standardise language names: the aman dataset labels this language "Pashto",
# so relabel it here too; done after sampling so the sampled rows are unchanged
chazzer["language"] = chazzer["language"].replace("Pushto", "Pashto")
chazzer_sampled["language"] = chazzer_sampled["language"].replace("Pushto", "Pashto")

In [14]:
# assign the train/test split to the sampled rows and tag them with their source
chazzer_sampled = (
    chazzer_sampled
    .pipe(helpers.train_test_split)
    .pipe(helpers.add_source, "chazzer")
)

Data split into 68918 train set, 17230 test set


In [15]:
# preview the first sampled rows; the index shown is the row's index in the full chazzer frame
chazzer_sampled.head()

Unnamed: 0,text,language,split,source
9388645,يموت,Arabic,train,chazzer
3859857,هو ضليعُ في علم الهندسة .,Arabic,train,chazzer
6912072,أدرك سامي أنّ المسلمين يمارسون أشياءاً مذكورة ...,Arabic,train,chazzer
1780571,"أنا لو أصبحت غنياً, سوف أشتريه.",Arabic,train,chazzer
472510,ذهب جون إلى فرنسا بالأمس.,Arabic,train,chazzer


## Combine dataset



Incorrect language labels were identified in the basil, aman and papluca datasets. These labels have been manually corrected in the combined dataset loaded below, which keeps the original label in `original_language` and the corrected label in `language`.

In [16]:
# load the combined dataset (basil, aman, papluca) with manually corrected labels
combined_dataset = pd.read_csv("Data/combined_dataset.csv")

In [17]:
# show the last few rows whose corrected label differs from the original one
combined_dataset[combined_dataset["original_language"] != combined_dataset["language"]].tail()

Unnamed: 0,text,original_language,split,source,language
105833,Umeda marks the northern end of the business a...,Arabic,train,papluca,English
111634,Case files would have to be copied and provide...,Urdu,test,papluca,English
115529,Romney Wins Nevada Caucus,Polish,train,papluca,English
117630,What the movie neglects to mention is that Kau...,Arabic,train,papluca,English
118814,Portugal Drownings: UK Girl And Grandad Named,Polish,train,papluca,English


Rows flagged as `remove`, duplicated text and empty text are removed from the dataset.

In [18]:
# Clean the combined dataset in one pass:
#   1. discard rows whose language column is flagged as "remove"
#   2. discard duplicated rows
#   3. discard the original (uncorrected) language column
combined_dataset = (
    combined_dataset[combined_dataset["language"] != "remove"]
    .copy()
    .pipe(helpers.rm_duplicates)
    .drop(columns="original_language")
)

2245 duplicates removed from dataset


Concatenate the combined dataset with the sampled chazzer dataset to form the complete dataset.

In [19]:
# stack the two frames row-wise (columns are aligned by name) and renumber the index
langdetect_dataset = pd.concat(
    (combined_dataset, chazzer_sampled),
    ignore_index=True,
)

In [20]:
# preview the first rows of the complete dataset
langdetect_dataset.head()

Unnamed: 0,text,split,source,language
0,"Nature, in the broadest sense, is the natural...",train,basil,English
1,"""Nature"" can refer to the phenomena of the phy...",train,basil,English
2,"The study of nature is a large, if not the onl...",train,basil,English
3,"Although humans are part of nature, human acti...",train,basil,English
4,[1] The word nature is borrowed from the Old F...,train,basil,English


## Export dataset

In [21]:
# Export the complete dataset. Disabled by default so that running the whole
# notebook does not write to disk; set the flag to True to export.
# (An explicit flag replaces the string-literal wrapper, which was echoed as
# cell output and had to be edited away before the code could run.)
EXPORT_COMPLETE_DATASET = False
if EXPORT_COMPLETE_DATASET:
  langdetect_dataset.to_csv("Data/langdetect.csv", index=False)

In [22]:
# Export the chazzer rows that were not sampled into the train-test set as an
# unseen dataset. Disabled by default so that running the whole notebook does
# not write to disk; set the flag to True to export.
# (An explicit flag replaces the string-literal wrapper, which was echoed as
# cell output and had to be edited away before the code could run.)
EXPORT_UNSEEN_DATASET = False
if EXPORT_UNSEEN_DATASET:
  unseen_dataset = chazzer.loc[chazzer["sampled"] == False].copy()
  unseen_dataset.to_csv("Data/langdetect_unseen.csv", index=False)