## Packages

In [None]:
import pandas as pd
import pickle as pkl
import numpy as np

## Load Data

In [None]:
# Load the three pickled tables from the post_processing data directory.
# NOTE: unpickling can execute arbitrary code -- only read pickle files from a trusted source.
language_df, spoken_languages_df, movie_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../../data/post_processing/language_df.pkl",
        "../../data/post_processing/spoken_languages_df.pkl",
        "../../data/post_processing/movie_df.pkl",
    )
)

## Helpers

In [1]:
# Maps a lower-case language name to the coarse language family used as a
# regression feature. Every value must be one of the entries of
# LANGUAGE_FAMILY_LIST; "Unclassified" is the catch-all for languages that fall
# outside the eight named families (and for constructed languages).
#
# NOTE(review): the partition is deliberately coarse. Uralic languages
# (finnish, estonian, hungarian, sami) are grouped under "Indo-European" and
# several mainland South-East Asian languages (khmer, thai, vietnamese) under
# "Austronesian". These groupings are kept as they were -- confirm they are
# intended before relying on the family columns for linguistic conclusions.
LANGUAGE_FAMILY_MAPPING = {
    'aboriginal malays': "Austronesian",
    'afrikaans': "Indo-European",
    'akan': "Atlantic–Congo",
    'albanian': "Indo-European",
    'algonquin': "Unclassified",
    'amharic': "Afro-Asiatic",
    # Egyptian is a branch of Afro-Asiatic (was wrongly "Indo-European").
    'ancient egyptian': "Afro-Asiatic",
    'ancient greek': "Indo-European",
    'apache, western': "Unclassified",
    'arabic': "Afro-Asiatic",
    'aramaic': "Afro-Asiatic",
    'armenian': "Indo-European",
    'assamese': "Indo-European",
    'assyrian': "Afro-Asiatic",
    'awadhi': "Indo-European",
    'azerbaijani': "Altaic",
    'bambara': "Atlantic–Congo",
    'belarusian': "Indo-European",
    'bengali': "Indo-European",
    'bhojpuri': "Indo-European",
    'bosnian': "Indo-European",
    'brazilian portuguese': "Indo-European",
    'bulgarian': "Indo-European",
    'burmese': "Sino-Tibetan",
    'cantonese': "Sino-Tibetan",
    'catalan': "Indo-European",
    'cebuano': "Austronesian",
    'chadian arabic': "Afro-Asiatic",
    # Northeast Caucasian, i.e. outside the named families; handled like
    # 'georgian' (was wrongly "Indo-European").
    'chechen': "Unclassified",
    'chewa': "Atlantic–Congo",
    'cheyenne': "Unclassified",
    'chinese': "Sino-Tibetan",
    'chinese, hakka': "Sino-Tibetan",
    'chinese, jinyu': "Sino-Tibetan",
    'corsican': "Indo-European",
    'cree': "Unclassified",
    'croatian': "Indo-European",
    'crow': "Unclassified",
    'czech': "Indo-European",
    'danish': "Indo-European",
    'dari': "Indo-European",
    'deutsch': "Indo-European",
    'dutch': "Indo-European",
    'dzongkha': "Sino-Tibetan",
    'egyptian arabic': "Afro-Asiatic",
    'english': "Indo-European",
    'esperanto': "Unclassified",
    'estonian': "Indo-European",
    'farsi': "Indo-European",
    'filipino': "Austronesian",
    'finnish': "Indo-European",
    'flemish': "Indo-European",
    'french': "Indo-European",
    'fula': "Atlantic–Congo",
    'fulfulde, adamawa': "Atlantic–Congo",
    'gaelic': "Indo-European",
    'galician': "Indo-European",
    'georgian': "Unclassified",
    'german': "Indo-European",
    'greek': "Indo-European",
    'guanzhong hua': "Sino-Tibetan",
    'gujarati': "Indo-European",
    'gumatj': "Unclassified",
    'hainanese': "Sino-Tibetan",
    'hariyani': "Indo-European",
    'haryanvi': "Indo-European",
    'hausa': "Afro-Asiatic",
    'hawaiian': "Austronesian",
    'hazaragi': "Indo-European",
    'hebrew': "Afro-Asiatic",
    'hiligaynon': "Austronesian",
    'hindi': "Indo-European",
    'hindustani': "Indo-European",
    'hinglish': "Indo-European",
    'hmong': "Sino-Tibetan",
    'hokkien': "Sino-Tibetan",
    'hopi': "Unclassified",
    'hungarian': "Indo-European",
    'hungary': "Indo-European",
    'icelandic': "Indo-European",
    'indonesian': "Austronesian",
    'inuktitut': "Unclassified",
    'irish': "Indo-European",
    'italian': "Indo-European",
    'japan': "Altaic",
    'japanese': "Altaic",
    'judeo-georgian': "Unclassified",
    'kannada': "Dravidian",
    'khmer': "Austronesian",
    'kinyarwanda': "Atlantic–Congo",
    'klingon': "Unclassified",
    'korean': "Altaic",
    'krio': "Indo-European",
    'kurdish': "Indo-European",
    'latin': "Indo-European",
    'lithuanian': "Indo-European",
    'luxembourgish': "Indo-European",
    'macedonian': "Indo-European",
    'malay': "Austronesian",
    'malayalam': "Dravidian",
    'mandarin': "Sino-Tibetan",
    'maninka': "Atlantic–Congo",
    'marathi': "Indo-European",
    'maya, yucatán': "Unclassified",
    'mende': "Atlantic–Congo",
    'min nan': "Sino-Tibetan",
    'mohawk': "Unclassified",
    'mongolian': "Altaic",
    'māori': "Austronesian",
    'nahuatls': "Unclassified",
    'napoletano-calabrese': "Indo-European",
    'navajo': "Unclassified",
    'nepali': "Indo-European",
    'norwegian': "Indo-European",
    'old english': "Indo-European",
    'oriya': "Indo-European",
    'palawa kani': "Unclassified",
    'papiamento': "Indo-European",
    'pashto': "Indo-European",
    'pawnee': "Unclassified",
    'persian': "Indo-European",
    'plautdietsch': "Indo-European",
    'polish': "Indo-European",
    'portuguese': "Indo-European",
    'punjabi': "Indo-European",
    'quechua': "Unclassified",
    'rajasthani': "Indo-European",
    'romani': "Indo-European",
    'romanian': "Indo-European",
    'russian': "Indo-European",
    'saami, north': "Indo-European",
    # Same language group as 'saami, north'; "Atlantic–Congo" (an African
    # family) was a data-entry error. Kept in step with the sibling entry.
    'samis': "Indo-European",
    'sanskrit': "Indo-European",
    'scanian': "Indo-European",
    'scottish gaelic': "Indo-European",
    'serbian': "Indo-European",
    'serbo-croatian': "Indo-European",
    'shanghainese': "Sino-Tibetan",
    'shanxi': "Sino-Tibetan",
    'sicilian': "Indo-European",
    'sign': "Sign",
    # NOTE(review): silent films are bucketed with sign languages -- confirm this is intended.
    'silent film': "Sign",
    'sinhala': "Indo-European",
    'sioux': "Unclassified",
    'slovak': "Indo-European",
    'slovenian': "Indo-European",
    'somali': "Afro-Asiatic",
    'sotho': "Atlantic–Congo",
    'spanish': "Indo-European",
    'sumerian': "Unclassified",
    'sunda': "Austronesian",
    'swahili': "Atlantic–Congo",
    'swedish': "Indo-European",
    'swiss german': "Indo-European",
    'tagalog': "Austronesian",
    'taiwanese': "Sino-Tibetan",
    'tamil': "Dravidian",
    'telugu': "Dravidian",
    'teochew': "Sino-Tibetan",
    'thai': "Austronesian",
    'tibetan': "Sino-Tibetan",
    'tibetans': "Sino-Tibetan",
    'tok pisin': "Indo-European",
    'tulu': "Dravidian",
    # Turkic, like 'azerbaijani'; grouped under "Altaic" for consistency
    # (was wrongly "Indo-European").
    'turkish': "Altaic",
    'tuu': "Unclassified",
    'tzotzil': "Unclassified",
    'ukrainian': "Indo-European",
    'urdu': "Indo-European",
    'venetian': "Indo-European",
    'vietnamese': "Austronesian",
    'welsh': "Indo-European",
    'wolof': "Atlantic–Congo",
    'xhosa': "Atlantic–Congo",
    'yiddish': "Indo-European",
    # Australian Aboriginal language (its dialect 'gumatj' is "Unclassified"
    # above); it is not Austronesian.
    'yolngu matha': "Unclassified",
    'zulu': "Atlantic–Congo",
}

# Ordered list of the language families used as regression features.
# The position of a family in this list is its slot in the one-hot vectors below.
LANGUAGE_FAMILY_LIST = [
    "Afro-Asiatic",
    "Altaic",
    "Atlantic–Congo",
    "Austronesian",
    "Dravidian",
    "Indo-European",
    "Sign",
    "Sino-Tibetan",
    "Unclassified",
]

# One-hot vector (list of 0/1 ints) for every family. Derived from
# LANGUAGE_FAMILY_LIST so that the vectors can never drift out of step with the
# list order that is used to index into them.
LANGUAGUE_ENCODING = {
    family: [int(slot == family_slot) for slot in range(len(LANGUAGE_FAMILY_LIST))]
    for family_slot, family in enumerate(LANGUAGE_FAMILY_LIST)
}

# Correctly spelled alias. The misspelled name above is kept because other
# cells reference it.
LANGUAGE_ENCODING = LANGUAGUE_ENCODING

## Create Regression DataFrame

In [None]:
# Derive the regression table from the movie table without touching movie_df:
# the identifier/text columns that are not regression features are dropped and
# the vote count is stored as a 32-bit integer.
movie_regression_df = (
    movie_df
    .drop(columns=["freebase_id", "plot"])
    .astype({"num_votes": np.int32})
)

## Languages

For the languages, we can try grouping by language family. This gives an idea of the differing influence of each common ancestral language without going into too much detail. We suggest the following partition, based on our previous analyses:
- Afro-Asiatic
- Altaic
- Atlantic–Congo
- Austronesian
- Dravidian
- Indo-European
- Sign
- Sino-Tibetan
- Unclassified

In [None]:
# Add one indicator column per language family to movie_regression_df:
# the column is 1 when at least one spoken language of the movie belongs to
# that family (max over the movie's language rows), 0 otherwise.
language_movie_df = spoken_languages_df.copy()

# Resolve the family name of every spoken-language row in a single pass.
family_per_row = language_movie_df["language_name"].map(LANGUAGE_FAMILY_MAPPING)

# Report every language name that has no family at once, rather than failing
# with a bare KeyError on the first one deep inside an apply().
unmapped_names = language_movie_df.loc[family_per_row.isna(), "language_name"].unique()
if len(unmapped_names) > 0:
    raise KeyError(
        "No language family defined in LANGUAGE_FAMILY_MAPPING for: {}".format(
            sorted(str(name) for name in unmapped_names)))

# Kept for backward compatibility: one-hot vector of the row's family.
language_movie_df["language_encoding"] = family_per_row.apply(
    lambda family: np.array(LANGUAGUE_ENCODING[family]))

for family_index, language_family in enumerate(LANGUAGE_FAMILY_LIST):
    # Value of slot `family_index` in each family's encoding vector, looked up
    # once per family instead of once per row.
    flag_by_family = {name: vector[family_index]
                      for name, vector in LANGUAGUE_ENCODING.items()}
    language_movie_df[language_family] = family_per_row.map(flag_by_family)

# A single groupby for all family columns.
family_flags_per_movie = language_movie_df.groupby("movie_id")[LANGUAGE_FAMILY_LIST].max()

# NOTE(review): the assignment aligns on the index, so this assumes
# movie_regression_df is indexed by movie_id -- confirm.
# NOTE(review): movies without any row in spoken_languages_df receive NaN in
# every family column (unchanged behaviour) -- decide whether 0 is wanted.
for language_family in LANGUAGE_FAMILY_LIST:
    movie_regression_df[language_family] = family_flags_per_movie[language_family]