In [3]:
import wikipediaapi
import pandas as pd
import numpy as np
from multiprocessing import Pool
from datetime import datetime
import os

In [4]:
# Module-level Wikipedia API client, constructed with 'hi' (the Hindi edition).
# word_list() looks pages up through this shared instance.
wiki = wikipediaapi.Wikipedia('hi')

In [5]:
def get_eng_title(page):
    """Return the English title linked from a Hindi page, or None.

    Args:
        page: object exposing a ``langlinks`` mapping from language code to
            the linked page; each linked page has a ``title`` attribute.

    Returns:
        The ``title`` of the 'en' entry, or None when the mapping has no
        'en' entry.
    """
    links_by_lang = page.langlinks

    # Guard clause: no English counterpart for this page.
    if 'en' not in links_by_lang:
        return None
    return links_by_lang['en'].title

In [6]:
def print_links(page, body):
    """Collect the link titles of ``page`` that occur verbatim in ``body``.

    Despite the name, nothing is printed; the matches are returned.

    Args:
        page: object whose ``links`` attribute iterates over link titles
            (strings).
        body: text in which each title is searched for as a substring.

    Returns:
        List of the titles found in ``body``, in the iteration order of
        ``page.links``.
    """
    return [title for title in page.links if title in body]

In [7]:
def word_list(hindi_list):
    """Map Hindi page titles to their English Wikipedia titles.

    Each title is looked up through the module-level ``wiki`` client. Titles
    whose page does not exist, or whose page has no English language link,
    are left out of the result.

    Args:
        hindi_list: iterable of Hindi page titles.

    Returns:
        List of ``[hindi_title, english_title]`` pairs, in input order.
        Duplicate input titles produce duplicate pairs.
    """
    # list to return hindi-english mappings in
    word_mappings = []

    for hindi_title in hindi_list:
        page = wiki.page(hindi_title)

        # If the page doesn't exist for this hindi title, skip it. exists()
        # reports absence through its return value, which used to be
        # discarded; a KeyError from the lookup is still treated as missing.
        try:
            if not page.exists():
                continue
        except KeyError:
            continue

        # get english title of hindi topic
        eng_title = get_eng_title(page)
        # if there is no english title, skip it
        if eng_title is None:
            continue
        # append final map list with english and hindi titles
        word_mappings.append([hindi_title, eng_title])
    return word_mappings

In [None]:
# Smoke-test word_list with the same Hindi title passed twice.
word_list(['ब्रिटेन','ब्रिटेन'])

In [8]:
def find_and_replace(body, word_list):
    """Build the code-mixed text by swapping Hindi terms for English ones.

    Args:
        body: text to transform.
        word_list: sequence of pairs; element 0 is the string to find and
            element 1 is its replacement.

    Returns:
        ``body`` after applying every replacement in order. Later pairs
        operate on the output of earlier ones.
    """
    mixed = body
    for pair in word_list:
        source, target = pair[0], pair[1]
        mixed = mixed.replace(source, target)
    return mixed

In [None]:
# Read the English names from cand.txt, one per line.
names = []

# The context manager closes the file once reading is done; the handle used
# to be left open for the lifetime of the kernel.
with open('cand.txt', "r", encoding = "ISO-8859-1") as names_file:
    # clean data and append in names list
    for name in names_file:
        # drop the line break and join words with underscores
        name = name.replace('\n','').replace(" ","_")
        names.append(name)

In [None]:
# Create a one-column dataframe of English names and keep the first 20 rows.
# .copy() makes the subset an independent frame, so adding columns to it
# later cannot trigger pandas' SettingWithCopyWarning (a plain [0:20] slice
# stays linked to its parent frame).
names_df = pd.DataFrame({'english': names}).iloc[0:20].copy()
names_df

In [9]:
def get_hindi_name(x):
    """Look up the Hindi title linked from an English Wikipedia page.

    A new 'en' API client is created on every call. When a Hindi link is
    found, its title is printed before being returned.

    Args:
        x: title of the English page to look up.

    Returns:
        The title of the page's 'hi' language link, or None when the page
        has no 'hi' link.
    """
    page = wikipediaapi.Wikipedia('en').page(x)

    # Guard clause: no Hindi counterpart for this page.
    if 'hi' not in page.langlinks:
        return None

    hindi_name = page.langlinks['hi'].title
    print(hindi_name)
    return hindi_name

In [None]:
# Look up the Hindi title for every English name. get_hindi_name expects a
# single title, so it is mapped over the 'english' column; calling
# DataFrame.apply(..., axis=1) would hand it a whole row (a Series) instead.
names_df['hindi'] = names_df['english'].apply(get_hindi_name)

In [None]:
# Display the frame to inspect the looked-up Hindi titles.
names_df

In [None]:
# Drop the rows for which no Hindi title was found. get_hindi_name returns
# the None object (not the string 'None') for those rows, and dropna()
# already treats None as missing, so no replace() step is needed.
names_df = names_df.dropna(subset=['hindi'])

In [None]:
# Renumber the rows 0..len-1 (discarding the old labels) and show the result.
names_df = names_df.reset_index(drop=True)
names_df

In [None]:
# Persist the English/Hindi pairs to parallel_corpus.csv. With to_csv's
# default index=True the row index is written as an extra leading column.
names_df.to_csv('parallel_corpus.csv')

In [10]:
# Collects, for every processed title, the [hindi, english] pairs that were
# substituted into its text.
word_corpus_list = []


def get_mixed_corpus(hindi_title):
    """Return the code-mixed text of a Hindi Wikipedia page.

    The page text is fetched with a fresh 'hi' client, stripped of line
    breaks, '==' heading markers, zero-width joiners and danda characters,
    and then every linked title that occurs in the text and has an English
    counterpart is replaced by its English title. The title is printed as a
    progress trace.

    Side effect: the list of [hindi, english] pairs used for this page is
    appended to the module-level ``word_corpus_list``.

    Args:
        hindi_title: title of the Hindi page to process.

    Returns:
        The cleaned page text with Hindi link titles replaced by English ones.
    """
    page_py = wikipediaapi.Wikipedia('hi').page(hindi_title)

    # Remove the unwanted fragments one after another, in a fixed order.
    body = page_py.text
    for unwanted in ('\n', '==', '\u200d', '।'):
        body = body.replace(unwanted, '')
    print(hindi_title)

    # Links present in the text -> [hindi, english] pairs -> substituted text.
    hindi_english_list = word_list(print_links(page_py, body))
    word_corpus_list.append(hindi_english_list)

    return find_and_replace(body, hindi_english_list)

In [11]:
# Load the English/Hindi title pairs from names_parallel.csv.
# NOTE(review): the save cell earlier in this notebook writes
# 'parallel_corpus.csv', not this file — confirm which file is intended.
names_parallel = pd.read_csv('names_parallel.csv')
# Disabled clean-up: would drop every column whose label contains 'Unnamed'
# (such as the 'Unnamed: 0' column visible when the frame is displayed).
# names_parallel.drop(names_parallel.columns[names_parallel.columns.str.contains('Unnamed', case=False)], axis=1, inplace=True)

In [12]:
# Preview the loaded title pairs.
names_parallel

Unnamed: 0,english,hindi
0,Naresh_Chandra,नरेश चन्द्रा
1,Ram_Narayan,राम नारायण
2,Man_Mohan_Sharma,मनमोहन शर्मा
3,Kalu_Lal_Shrimali,के॰ एल॰ श्रीमाली
4,Nagendra_Singh,नगेन्द्र सिंह
...,...,...
2831,Vinoo_Mankad,वीनू हिम्मतलाल माँकड़
2832,Zaheer_Khan,ज़हीर ख़ान
2833,Rohit_Sharma,रोहित शर्मा
2834,Anjali_Ved_Pathak_Bhagwat,अंजली भगवत


In [20]:
def get_nth_batch(n, batch_size=4):
    """Return the n-th batch of Hindi titles from ``names_parallel``.

    Args:
        n: 0-based batch number.
        batch_size: number of rows per batch. Defaults to 4, the size that
            used to be hard-coded.

    Returns:
        The 'hindi' column rows at positions
        ``n * batch_size`` up to (not including) ``(n + 1) * batch_size``.
        The slice is shorter, or empty, when it runs past the end.
    """
    first = n * batch_size
    # .iloc makes the positional slicing explicit.
    return names_parallel['hindi'].iloc[first:first + batch_size]

def work_on_batch(n):
    """Build and save the mixed corpus for batch ``n``.

    The batch is skipped when ``mixed_corpus/batch_{n}.csv`` already exists.
    Otherwise every title returned by ``get_nth_batch(n)`` is run through
    ``get_mixed_corpus`` and the texts are saved as a one-column CSV
    ('mixed_corpus').

    Args:
        n: 0-based batch number.

    Returns:
        None.
    """
    output_dir = 'mixed_corpus'
    filename = f'{output_dir}/batch_{n}.csv'

    # skip nth batch if it is already saved in mixed_corpus folder
    if os.path.isfile(filename):
        return
    print(f'Working on batch {n}')

    # n = nth batch from 2836 rows
    hindi_names_list = get_nth_batch(n)
    local_mixed_corpus_list = [get_mixed_corpus(name) for name in hindi_names_list]

    df = pd.DataFrame(local_mixed_corpus_list, columns=['mixed_corpus'])

    # Make sure the target folder exists; exist_ok tolerates several workers
    # creating it at the same time.
    os.makedirs(output_dir, exist_ok=True)

    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a partial batch file that a later run would skip.
    temp_filename = f'{filename}.tmp'
    df.to_csv(temp_filename)
    os.replace(temp_filename, filename)

    print(f'Saved batch {n} file')

def task_finished(task):
    """Report that ``task`` is done, followed by the current wall-clock time.

    Prints two lines: the value of ``task.get_name()`` followed by
    ' finished', then the current time formatted as HH:MM:SS.

    Args:
        task: object exposing a ``get_name()`` method.
    """
    task_name = task.get_name()
    print(task_name, ' finished')

    time_format = '%H:%M:%S'
    print(datetime.now().strftime(time_format))

In [21]:
# Time the parallel run over all batches.
start = datetime.now()
# Size the pool by the set of CPUs the current process may run on (pid 0 =
# the calling process). NOTE(review): os.sched_getaffinity is not available
# on every platform — confirm the target environment provides it.
cpu_cores = len(os.sched_getaffinity(0))
n = 709 # number of batches

# create no of processes equal to no of cpu cores
# Batches whose CSV already exists are skipped by work_on_batch, so an
# interrupted run can be restarted.
with Pool(cpu_cores) as p:
    p.map(work_on_batch, range(n))
end = datetime.now()

# elapsed wall-clock time
print(end-start)

Working on batch 91
Working on batch 1
Working on batch 135
Working on batch 46
नगेन्द्र सिंह
हिमन्त विश्व शर्मा
निक्की प्रधान
विक्रम बत्रा
एवरेस्ट पर्वत
सर्बानन्द सोणोवाल
दौलत सिंह कोठारी
परमवीर चक्र
नरेन्द्र मोदी
सैयदा अनवरा तैमूर
मोहन सिंह मेहता
सौरभ कालिया
कालीचरन ब्रह्म
Saved batch 91 file
Working on batch 92
अभिजीत भट्टाचार्य
हिमा दास
विक्टोरिया क्रास
Saved batch 46 file
Working on batch 47
सोमनाथ शर्मा
माणिक्यलाल वर्मा
गोकुल शर्मा
शिवा थापा
Saved batch 1 file
Working on batch 2
घनश्यामदास बिड़ला
Saved batch 92 file
Working on batch 93
रियान पराग
रोबिन बनर्जी
जादव पायेंग
धनसिंह थापा
पद्म भूषण
शेर जंग थापा
उद्धव भराली
महावीर चक्र
Saved batch 47 file
Working on batch 48
Saved batch 93 file
Working on batch 94
रवि भाटिया
जॉयन्ति चुटिया
मोहित चौहान
प्रेम चोपड़ा
द टाइम्स ऑफ़ इण्डिया
शिप्रा खन्ना
Saved batch 48 file
Working on batch 49
अनुपम खेर
कंगना राणावत
प्रिया राजवंश
अस्मिता सूद
Saved batch 49 file
Working on batch 50
कोमल कोठारी
प्रेम कुमार धूमल
शांता कुमार
वीरभद्र सिंह
दुर्गा ला

Process ForkPoolWorker-16:


KeyboardInterrupt: 

In [18]:
# Sequential timing run: build the mixed corpus for the first 12 titles.
mixed_corpus_list = []
# Not used by the loop below (it is fixed at 12 iterations); note that this
# rebinds the `n` that the parallel cell sets to the number of batches.
n = len(names_parallel)

start = datetime.now()
for i in range(12):
    mixed_corpus_list.append(get_mixed_corpus(names_parallel['hindi'][i]))
end = datetime.now()

# elapsed wall-clock time
print(end-start)

नरेश चन्द्रा
राम नारायण
मनमोहन शर्मा
के॰ एल॰ श्रीमाली
नगेन्द्र सिंह
दौलत सिंह कोठारी
मोहन सिंह मेहता
माणिक्यलाल वर्मा
घनश्यामदास बिड़ला
पद्म भूषण
कोमल कोठारी
दुर्गा लाल
0:03:04.469945


4

In [None]:
# List the column labels of the loaded frame.
names_parallel.columns

In [None]:
# Inspect the title mappings that get_mixed_corpus has appended in this process.
# NOTE(review): calls made inside Pool workers presumably append to each
# worker's own copy of the list, not to this one — confirm.
word_corpus_list