# Language detection
1. Messages are stored in a tab-separated CSV file
2. We load the messages into a Spark DataFrame
3. We use a previously trained fastText model to predict the language of each message
4. The previously trained fastText model is provided by a custom `fasttext_lang_classifier` lib

In [1]:
# Display the active SparkSession. `spark` is not created in any of the
# visible cells, so it is presumably injected by the kernel (e.g. a PySpark
# kernel/shell) — TODO confirm.
spark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import *
from pyspark.sql.types import StructType

In [3]:
# Read the tab-separated sentences file using an explicit three-column schema
# (all columns nullable) instead of letting Spark infer the types.
schema = (
    StructType()
    .add("sentence_id", IntegerType(), True)
    .add("language_code", StringType(), True)
    .add("text", StringType(), True)
)
messages = spark.read.schema(schema).option('sep', '\t').csv('data/sentences.csv')

## Load our custom fastText language classifier lib

In [4]:
import fasttext_lang_classifier

In [5]:
# Wrap the library's predictor as a Spark UDF; the result column is a string
# (StringType is also what udf() uses when no return type is given).
udf_predict_language = udf(fasttext_lang_classifier.predict_language,
                           returnType=StringType())

### Test predictor

In [6]:
# Smoke-test the plain Python predictor on an English sentence (no Spark job)
fasttext_lang_classifier.predict_language('Hello world!')

'eng'

In [7]:
# Smoke-test the plain Python predictor on a Finnish sentence (no Spark job)
fasttext_lang_classifier.predict_language('Moi maailma!')

'fin'

## Predict language

In [8]:
%%time
# Append the model's prediction for each row's `text` as `predicted_lang`
messages = messages.withColumn(
    'predicted_lang',
    udf_predict_language(messages['text']),
)

CPU times: user 9.85 ms, sys: 3.43 ms, total: 13.3 ms
Wall time: 148 ms


In [9]:
%%time
# Print the first 20 rows with long cell values truncated (show()'s defaults)
messages.show(n=20, truncate=True)

+-----------+-------------+--------------------+--------------+
|sentence_id|language_code|                text|predicted_lang|
+-----------+-------------+--------------------+--------------+
|          1|          cmn|              我們試試看！|           cmn|
|          2|          cmn|             我该去睡觉了。|           cmn|
|          3|          cmn|             你在干什麼啊？|           cmn|
|          4|          cmn|              這是什麼啊？|           cmn|
|          5|          cmn|今天是６月１８号，也是Muirie...|           cmn|
|          6|          cmn|       生日快乐，Muiriel！|           cmn|
|          7|          cmn|      Muiriel现在20岁了。|           cmn|
|          8|          cmn|       密码是"Muiriel"。|           cmn|
|          9|          cmn|            我很快就會回來。|           cmn|
|         10|          cmn|               我不知道。|           cmn|
|         11|          cmn|        我不知道應該說什麼才好。|           cmn|
|         12|          cmn|           這個永遠完不了了。|           cmn|
|         13|          cmn|     我只是不知道應該

In [10]:
# Check what the predictor returns for a sentence mixing English and Finnish
fasttext_lang_classifier.predict_language('Hello World Moi Maailma')

'eng'

In [11]:
# Query the library's underlying model object directly for a single
# character; judging by the output below it yields (label, probability)
# pairs, one list per input text.
fasttext_lang_classifier.model.predict_proba(['世'])

[[('__label__cmn', 0.871094)]]

## Predict test samples

In [123]:
def get_msg(data_col):
    """Return the message part of a fastText-formatted line.

    A line looks like ``__label__<lang> <message text>``: everything up to
    and including the first space is dropped and the remainder is returned
    unchanged (including any further spaces).

    Args:
        data_col: One raw line as a string, or None for a null column value.

    Returns:
        The text after the first space; an empty string when the line has no
        space; None when the input is None.
    """
    # A null column value reaches a Python UDF as None; pass it through as
    # null instead of failing with AttributeError.
    if data_col is None:
        return None
    # partition() cuts once at the first space, so the rest of the line is
    # never split apart and re-joined.
    return data_col.partition(' ')[2]
# Spark UDF wrapper around get_msg; produces a string column (StringType is
# also what udf() uses when no return type is given).
get_message = udf(get_msg, returnType=StringType())

In [121]:
# Each line of the fastText file becomes one row in a single `value` column.
# NOTE(review): this section is titled "test samples" but the file is named
# fasttext_train.txt — confirm this is not the data the model was trained on.
test = spark.read.format('text').load('data/fasttext_train.txt')

In [85]:
from pyspark.sql import functions

In [124]:
# Split every raw line into its label (text before the first space) and the
# message (everything after it).
test = (
    test
    .withColumn('language', functions.substring_index(test['value'], ' ', 1))
    .withColumn('message', get_message(col('value')))
)

In [125]:
from pprint import pprint

# Pretty-print five rows taken from a seeded 1% sample drawn without replacement
pprint(test.sample(withReplacement=False, fraction=0.01, seed=42).take(5))

[Row(value='__label__cmn 我不知道。', language='__label__cmn', message='我不知道。'),
 Row(value='__label__deu Unglücklicherweise stimmt es.', language='__label__deu', message='Unglücklicherweise stimmt es.'),
 Row(value='__label__deu Das ist das Dümmste, was ich je gesagt habe.', language='__label__deu', message='Das ist das Dümmste, was ich je gesagt habe.'),
 Row(value='__label__deu Wenn du keine Kinder kriegen kannst, kannst du immer noch welche adoptieren.', language='__label__deu', message='Wenn du keine Kinder kriegen kannst, kannst du immer noch welche adoptieren.'),
 Row(value='__label__deu Seien wir ehrlich, es ist unmöglich. Wir werden es nie schaffen.', language='__label__deu', message='Seien wir ehrlich, es ist unmöglich. Wir werden es nie schaffen.')]


In [126]:
# Append the model's prediction for each extracted message as `predicted_lang`
test = test.withColumn(
    'predicted_lang',
    udf_predict_language(test['message']),
)

In [127]:
# Show five rows from the same seeded 1% sample, now with predictions attached
test.sample(withReplacement=False, fraction=0.01, seed=42).show(n=5)

+--------------------+------------+--------------------+--------------+
|               value|    language|             message|predicted_lang|
+--------------------+------------+--------------------+--------------+
|  __label__cmn 我不知道。|__label__cmn|               我不知道。|           cmn|
|__label__deu Ungl...|__label__deu|Unglücklicherweis...|           deu|
|__label__deu Das ...|__label__deu|Das ist das Dümms...|           deu|
|__label__deu Wenn...|__label__deu|Wenn du keine Kin...|           deu|
|__label__deu Seie...|__label__deu|Seien wir ehrlich...|           deu|
+--------------------+------------+--------------------+--------------+
only showing top 5 rows



In [132]:
# Count rows per (labelled language, predicted language) pair and list the
# 100 most frequent pairs. Note the label column still carries the
# `__label__` prefix while the prediction column does not.
(
    test
    .groupBy('language', 'predicted_lang')
    .count()
    .orderBy('count', ascending=False)
    .show(100)
)

+-------------+--------------+------+
|     language|predicted_lang| count|
+-------------+--------------+------+
| __label__eng|           eng|616234|
| __label__tur|           tur|461610|
| __label__epo|           epo|438097|
| __label__rus|           rus|424834|
| __label__ita|           ita|419386|
| __label__deu|           deu|312408|
| __label__fra|           fra|269938|
| __label__spa|           spa|220595|
| __label__por|           por|203944|
| __label__hun|           hun|163616|
| __label__heb|           heb|151325|
| __label__jpn|           cmn|112876|
| __label__ber|           ber| 84527|
| __label__pol|           pol| 72607|
| __label__mkd|           mkd| 61636|
| __label__fin|           fin| 61492|
| __label__ukr|           ukr| 60105|
| __label__nld|           nld| 52508|
| __label__cmn|           cmn| 46233|
| __label__jpn|           jpn| 34915|
| __label__mar|           mar| 31182|
| __label__dan|           dan| 25837|
| __label__swe|           swe| 25184|
| __label__e