In [19]:
import wikipediaapi
import pandas as pd
import numpy as np
from multiprocessing import Pool
from datetime import datetime
import os

In [2]:
# Create the shared Wikipedia API client used by word_list() below.
# NOTE(review): 'hi' is passed positionally; presumably it selects the Hindi
# edition (the helpers below look up 'en' langlinks on its pages) — confirm
# against the constructor signature of the installed wikipediaapi version.
wiki = wikipediaapi.Wikipedia('hi')

In [3]:
def get_eng_title(page):
    """Return the English title linked from ``page``, or None if there is none.

    page: object exposing ``langlinks``, a mapping from language code to a
          linked page that carries a ``title`` attribute.
    """
    links_by_language = page.langlinks

    # guard clause: no English counterpart recorded for this page
    if 'en' not in links_by_language:
        return None

    return links_by_language['en'].title

In [4]:
def print_links(page,body):
    """Collect the link titles of ``page`` that occur verbatim in ``body``.

    Despite the name nothing is printed: the matching titles are returned as
    a list, in the iteration order of ``page.links``.
    """
    return [title for title in page.links if title in body]

In [5]:
def word_list(hindi_list):
    """Map Hindi page titles to their English titles.

    hindi_list: iterable of Hindi page titles, looked up through the
                module-level ``wiki`` client.

    Returns a list of ``[hindi_title, english_title]`` pairs, one per input
    title that has an existing page with an English langlink. Titles are not
    de-duplicated, so a repeated input title yields a repeated pair.
    """
    # list to return hindi-english mappings in
    word_mappings = []

    for hindi_title in hindi_list:
        page = wiki.page(hindi_title)

        # Skip titles without a page. The result of exists() is now checked
        # (it used to be discarded); a KeyError raised during the lookup is
        # still treated as "page missing", as before.
        try:
            if not page.exists():
                continue
        except KeyError:
            continue

        # get english title of hindi topic
        eng_title = get_eng_title(page)
        # if there is no english title, skip it
        if eng_title is None:
            continue
        # append final map list with english and hindi titles
        word_mappings.append([hindi_title, eng_title])
    return word_mappings

In [None]:
# Quick manual check of word_list(). The same title is passed twice, and
# word_list() appends one pair per input title, so a hit is returned twice.
word_list(['ब्रिटेन','ब्रिटेन'])

In [6]:
# create code mixed corpus by replacing hindi word in body with english one
def find_and_replace(body, word_list):
    """Replace every source phrase in ``body`` with its target phrase.

    body:      text to rewrite.
    word_list: sequence of pairs; item 0 is the phrase to find, item 1 the
               replacement (e.g. ``[hindi_title, english_title]``).

    Returns the rewritten text.

    Pairs are applied longest source phrase first. Otherwise a short phrase
    that is a substring of a longer one would be replaced first and the
    longer phrase could never match. Pairs with an empty source phrase are
    ignored, because ``str.replace('', x)`` would insert ``x`` between every
    character.
    """
    # sorted() is stable, so pairs with equally long sources keep their order
    longest_first = sorted(word_list, key=lambda pair: len(pair[0]), reverse=True)

    for pair in longest_first:
        source, target = pair[0], pair[1]
        if not source:
            continue
        body = body.replace(source, target)
    return body

In [None]:
# open file with english names; the with-block closes the handle afterwards
# (the file was previously left open)
with open('cand.txt', "r", encoding = "ISO-8859-1") as names_file:
    # clean data: drop the newline and turn spaces into underscores,
    # one entry per line of the file
    names = [name.replace('\n','').replace(" ","_") for name in names_file]

In [None]:
# create dataframe of english names, limited to the first 20 rows
# .copy() makes the subset an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning
names_df = pd.DataFrame({'english': names}).iloc[:20].copy()
names_df

In [7]:
def get_hindi_name(x):
    """Look up the Hindi title linked from the page named ``x``.

    x: value handed to ``page()`` of a client created with 'en'.

    Returns the title of the 'hi' langlink (which is also printed), or None
    when the page has no 'hi' langlink.
    """
    english_page = wikipediaapi.Wikipedia('en').page(x)
    links_by_language = english_page.langlinks

    # guard clause: no Hindi counterpart recorded for this page
    if 'hi' not in links_by_language:
        return None

    hindi_name = links_by_language['hi'].title
    print(hindi_name)
    return hindi_name

In [None]:
# Look up the Hindi title for each English name.
# Apply over the 'english' column so get_hindi_name() receives the title
# string itself; DataFrame.apply(axis=1) would hand it a whole row Series.
names_df['hindi'] = names_df['english'].apply(get_hindi_name)

In [None]:
# show the frame with the newly added 'hindi' column
names_df

In [None]:
# Remove incomplete rows: the string 'None' is first turned into NaN, then
# every row that contains a missing value is dropped.
# NOTE(review): get_hindi_name() returns the None object, not the string
# 'None', so the replace step presumably only matters if that literal text
# occurs in the data — confirm whether it is still needed.
names_df = names_df.replace(to_replace='None', value=np.nan).dropna()

In [None]:
# renumber the rows 0..len-1, discarding the old index, then show the result
names_df = names_df.reset_index(drop=True)
names_df

In [None]:
# Save the English/Hindi name pairs to disk.
# NOTE(review): a later cell reads 'names_parallel.csv', not this file —
# confirm which file the corpus run is meant to use. With default to_csv
# arguments the row index is presumably written as an extra column.
names_df.to_csv('parallel_corpus.csv')

In [8]:
# one Hindi/English mapping list per processed title, filled by get_mixed_corpus
word_corpus_list = []

def get_mixed_corpus(hindi_title):
    """Build the code-mixed text for one Hindi page title.

    The page text is fetched with a client created with 'hi' and cleaned.
    Link titles that occur in the text are mapped to English titles and
    substituted into the text.

    Side effects: prints ``hindi_title`` and appends the mapping list for
    this page to the module-level ``word_corpus_list``.
    NOTE(review): when this runs inside Pool workers, the append presumably
    lands in the worker's own copy of the list — confirm.

    Returns the text after substitution.
    """
    hindi_page = wikipediaapi.Wikipedia('hi').page(hindi_title)

    # strip, in this order: newlines, '==' markers, U+200D characters, '।'
    body = hindi_page.text
    for unwanted in ('\n', '==', '\u200d', '।'):
        body = body.replace(unwanted, '')

    print(hindi_title)

    # keep only links that appear in the text, then resolve them to English
    title_pairs = word_list(print_links(hindi_page, body))
    word_corpus_list.append(title_pairs)

    return find_and_replace(body, title_pairs)

In [9]:
# Load the name pairs used by the corpus runs below; get_nth_batch() reads
# the 'hindi' column of this frame.
names_parallel = pd.read_csv('names_parallel.csv')
# A disabled line here used to drop every column whose name contains
# 'Unnamed' (case-insensitive). NOTE(review): confirm whether it is still needed.

In [None]:
# show the loaded frame
names_parallel

In [22]:
def get_nth_batch(n, batch_size=709):
    """Return the n-th batch of Hindi titles from ``names_parallel``.

    n:          zero-based batch number.
    batch_size: number of rows per batch. The default of 709 is the value
                that used to be hard-coded here.

    Returns the positional slice of the 'hindi' column for that batch; the
    slice is shorter or empty when it runs past the end of the frame.
    """
    first_row = batch_size * n
    # .iloc makes explicit that the bounds are row positions, not index labels
    return names_parallel['hindi'].iloc[first_row:first_row + batch_size]

def work_on_batch(n):
    """Build the code-mixed text for every Hindi title in batch ``n``.

    n: batch number passed on to get_nth_batch() (the original note describes
       it as the nth batch out of 2836 rows).

    Returns a list with one get_mixed_corpus() result per title, in batch order.
    """
    return [get_mixed_corpus(hindi_name) for hindi_name in get_nth_batch(n)]

def task_finished(task):
    """Report that ``task`` is done, followed by the current wall-clock time.

    task: object providing ``get_name()``.

    Prints two lines: the task name with ' finished', then HH:MM:SS.
    """
    task_name = task.get_name()
    print(task_name, ' finished')

    time_stamp = datetime.now().strftime('%H:%M:%S')
    print(time_stamp)

In [17]:
start = datetime.now()
# os.sched_getaffinity is not available on every platform; fall back to the
# machine-wide CPU count (or 1 if that is unknown) where it is missing
if hasattr(os, 'sched_getaffinity'):
    cpu_cores = len(os.sched_getaffinity(0))
else:
    cpu_cores = os.cpu_count() or 1
n = 4 # number of batches

# create no of processes equal to no of cpu cores
with Pool(cpu_cores) as p:
    # keep what the workers return (it used to be discarded): one list of
    # code-mixed texts per batch, in batch order
    parallel_corpus_batches = p.map(work_on_batch, range(n))
end = datetime.now()

print(end-start)

नगेन्द्र सिंह
नरेश चन्द्रा
घनश्यामदास बिड़ला
राम नारायण
दौलत सिंह कोठारी
पद्म भूषण
मनमोहन शर्मा
मोहन सिंह मेहता
के॰ एल॰ श्रीमाली
माणिक्यलाल वर्मा
कोमल कोठारी
दुर्गा लाल
0:01:48.865060


In [18]:
# Sequential run over the first few titles, timed for comparison with the
# multiprocessing run.
mixed_corpus_list = []
sample_size = 12 # number of titles to process

# The unused `n = len(names_parallel)` assignment was removed: it overwrote
# the batch count `n` that the multiprocessing cell relies on.
start = datetime.now()
# append one result at a time so partial results survive an interrupted run
for hindi_title in names_parallel['hindi'].iloc[:sample_size]:
    mixed_corpus_list.append(get_mixed_corpus(hindi_title))
end = datetime.now()

print(end-start)

नरेश चन्द्रा
राम नारायण
मनमोहन शर्मा
के॰ एल॰ श्रीमाली
नगेन्द्र सिंह
दौलत सिंह कोठारी
मोहन सिंह मेहता
माणिक्यलाल वर्मा
घनश्यामदास बिड़ला
पद्म भूषण
कोमल कोठारी
दुर्गा लाल
0:03:04.469945


4

In [None]:
# list the column names of the loaded frame
names_parallel.columns

In [None]:
# show the Hindi/English mapping lists that get_mixed_corpus() appended to
# this module-level list
word_corpus_list