In [87]:
import argparse
import pywikibot
import mwparserfromhell as mwp
import csv
import json
from pathlib import Path

#!pip install pywikibot  
#!pip install mwparserfromhell

In [88]:
# Root folder that holds the input CSV files and the generated 'Articles' folder.
# Raw string: '\L', '\M' and '\W' are invalid escape sequences in a plain literal
# and trigger a SyntaxWarning/DeprecationWarning on current Python versions.
BASE = Path(r'C:\LEARNING\MISC\Wikipedia')

# Text fragments used when assembling infobox markup and Urdu sentences.
nl = '\n'          # line break between infobox rows
s = " "            # word separator
p = '| '           # prefix of an infobox parameter row
espc = ' = '       # separator between a parameter name and its value
c = ' ،'           # Arabic-script comma, preceded by a space
ds = "۔ "          # Urdu full stop, followed by a space
btag = "'''"       # wiki markup for bold text
NA = 'غیرموجود'    # placeholder text (presumably "not available" -- TODO confirm)
ls = "* "          # wiki markup for a list item
title = ''
# English key -> Urdu text lookup; filled by enURTransliterate().
d = {}

In [89]:
def enURTransliterate(csvfile=None, mapping=None):
    """Load the English-to-Urdu lookup table from a two-column CSV file.

    Column 0 of each row becomes the key and column 1 the value. Rows with
    fewer than two columns (for example blank lines) are skipped.

    Args:
        csvfile: Path of the CSV file to read. Defaults to
            ``BASE / 'EN_UR_CRIC.csv'``.
        mapping: Dict to fill. Defaults to the module-level lookup dict ``d``.

    Returns:
        The dict that was filled (the same object as ``mapping``).
    """
    if csvfile is None:
        csvfile = BASE / 'EN_UR_CRIC.csv'
    if mapping is None:
        mapping = d

    # newline='' is what the csv module expects so that it can handle line
    # endings (including those embedded in quoted fields) itself.
    with open(csvfile, encoding='utf-8', newline='') as cfile:
        for row in csv.reader(cfile):
            # csv.reader yields [] for blank lines; indexing row[1] on such a
            # row would raise IndexError and abort the whole load.
            if len(row) < 2:
                continue
            mapping[row[0]] = row[1]

    return mapping

In [90]:
def inFileReader(csvfile=None):
    """Read the pipe-delimited input list into a list of row mappings.

    The first line of the file supplies the column names; the driver loop
    reads the ``ENG`` and ``URDU`` columns of each row.

    Args:
        csvfile: Path of the file to read. Defaults to
            ``BASE / 'women_actress.csv'``.

    Returns:
        A list with one mapping (column name -> cell text) per data row, in
        file order. An empty file yields an empty list.
    """
    if csvfile is None:
        csvfile = BASE / 'women_actress.csv'

    # newline='' lets the csv module do its own line-ending handling.
    with open(csvfile, encoding='utf-8', newline='') as cfile:
        return list(csv.DictReader(cfile, delimiter='|'))

In [91]:
def getWikiPage(lang, title):
    """Create a pywikibot page handle for a Wikipedia article.

    Args:
        lang: Language code handed to ``pywikibot.Site`` (the caller in this
            notebook passes "en").
        title: Title of the page.

    Returns:
        The ``pywikibot.Page`` built for ``title`` on that site.
    """
    wiki = pywikibot.Site(lang, 'wikipedia')
    return pywikibot.Page(wiki, title)

In [92]:
def get_intro(info,title):
    """Build the opening fragment of the article from the birth name.

    Args:
        info: Mapping of infobox parameters; only ``birth_name`` is read.
        title: Text placed directly in front of the birth name.

    Returns:
        ``title + birth_name + s`` when a non-empty birth name is present,
        otherwise an empty string.
    """
    # No default on purpose: the earlier fallback was the literal string 'NA',
    # which is truthy, so the guard never fired and "NA" ended up in the text.
    birthName = info.get('birth_name')
    if not birthName:
        return ""

    return title + birthName + s

In [93]:
def templateFiltering(wikicode):
    """Collect the templates of interest from parsed wikitext.

    Args:
        wikicode: Object exposing ``filter_templates()``; every template it
            yields must provide ``name.matches(str)``.

    Returns:
        Dict with up to four entries, keyed 'InfoBox', 'birth date', 'spouse'
        and 'occupation'. When several templates match the same name, the
        last one encountered is kept.
    """
    # (template name to match, key used in the returned dict)
    wanted = (
        ("Infobox person", 'InfoBox'),
        ('birth date', 'birth date'),
        ('spouse', 'spouse'),
        ('occupation', 'occupation'),
    )

    templateDict = {}
    for tmplt in wikicode.filter_templates():
        for template_name, key in wanted:
            if tmplt.name.matches(template_name):
                templateDict[key] = tmplt
                break  # the first matching name decides, like an if/elif chain

    return templateDict

In [94]:
def extractInfoBoxTemplate(all_templates):
    """Pick the 'Infobox person' parameters out of a page's template list.

    The parameters of any 'IMDb name' or 'Instagram' entry are printed as a
    side effect.

    Args:
        all_templates: Iterable of ``(template_name, params)`` pairs.

    Returns:
        The ``params`` object of the last 'Infobox person' entry, or an empty
        dict when the page has no such template.
    """
    # Start with an empty result so that a page without the infobox no longer
    # fails with UnboundLocalError at the return statement.
    InfoDict = {}
    for tmpl, params in all_templates:
        if tmpl == 'Infobox person':
            InfoDict = params
        elif tmpl in ('IMDb name', 'Instagram'):
            print(params)

    return InfoDict

In [95]:
# Load the input list once; each element is the mapping for one CSV data row.
lstActorPerson = inFileReader() 

In [96]:
def spouseInfo(spouse):
    """Collect the parameters of every ``marriage`` template in ``spouse``.

    Args:
        spouse: Wikitext of the infobox ``spouse`` field.

    Returns:
        A list with one entry per template whose name matches "marriage", in
        the order ``filter_templates()`` yields them; each entry is that
        template's ``params`` list.
    """
    parsed = mwp.parse(spouse)
    return [
        marriage.params
        for marriage in parsed.filter_templates()
        if marriage.name.matches("marriage")
    ]
    

In [97]:
def OccupationInfo(info):
    """Collect the parameter lists of all templates in the occupation field.

    Args:
        info: Wikitext of the infobox ``occupation`` field.

    Returns:
        A list with one entry per template found in ``info``; each entry is
        that template's ``params`` list. Empty when the field holds no
        template.
    """
    # The earlier `tmplt.get(1)` lookup was dropped: its result was never used,
    # and Template.get raises ValueError when the template has no first
    # positional parameter, which would abort the whole article.
    return [tmplt.params for tmplt in mwp.parse(info).filter_templates()]

In [98]:
def birthDateInfo(bInfo):
    """Compose the Urdu birth-date sentence from the ``birth_date`` wikitext.

    The date parts are taken from the first three positional parameters of
    the last template found in ``bInfo`` (year, month, day in the sample
    ``{{Birth date and age|1977|6|29}}``). The ``mf`` / ``df`` parameters are
    removed first because they are not date parts. Progress is printed.

    Reads the module-level ``urTitle`` (assigned by the driver loop) and the
    lookup dict ``d``.

    Args:
        bInfo: Wikitext of the field.

    Returns:
        The sentence, or "" when ``bInfo`` contains no template or the
        template has fewer than three positional parameters.
    """
    chosen = None
    for tmplt in mwp.parse(bInfo).filter_templates():
        # Both flags can be present at once, so each is checked on its own
        # rather than through if/elif (which left the second one in place).
        for flag in ('mf', 'df'):
            if tmplt.has(flag):
                print(f"Getting **{flag}** Template => {tmplt.get(flag).value}")
                tmplt.remove(flag)
        chosen = tmplt

    # Plain-text dates carry no template; there is nothing to build from.
    if chosen is None:
        return ""

    print(f'Template => {chosen}')

    # Only positional parameters are date parts; named ones are ignored.
    bdlst = [str(param.value).strip() for param in chosen.params if not param.showkey]
    if len(bdlst) < 3:
        return ""
    year, month, day = bdlst[:3]

    words = [
        urTitle, d['saal'], year, d['ke'], d['maah'], month, d['ki'],
        day, d['tareekh'], d['ko'], d['paida'], d['howein'],
    ]
    return s.join(words) + ds

In [99]:
def birthPlace(birthplace):
    """Compose the Urdu birth-place sentence.

    Reads the module-level ``urTitle`` and the lookup dict ``d``.

    Args:
        birthplace: Text of the infobox ``birth_place`` field.

    Returns:
        The words joined by ``s`` and terminated with ``ds``.
    """
    words = [urTitle, d['ki'], d['jaaye paidaish'], birthplace, d['hai']]
    return s.join(words) + ds
    

In [100]:
def UrduInfoBox(info,dInfo):
    """Assemble the infobox markup and the per-field article data.

    Only the keys name, birth_name, birth_date, citizenship, birth_place,
    spouse, 'years active' and occupation are handled; every other key of
    ``info`` is ignored. Handled keys are emitted in the order ``info``
    iterates them, each with its original value.

    Reads the module-level ``urTitle`` and the lookup dict ``d``.

    Args:
        info: Mapping of infobox parameter name to its text value.
        dInfo: Dict that receives one entry per handled key (mutated in place).

    Returns:
        Tuple ``(infobox_markup, dInfo)``.
    """
    def intro_sentence(value):
        print('name',"=>",value)
        return urTitle+s+d['aik']+s+d['actress']+s+d['hein']+ds

    def birth_name_sentence(value):
        return d['un']+s+d['ka']+s+d['birth_name']+s+value+s+d['hai']+ds

    def keep(value):
        return value

    # Handled key -> function producing the value stored in dInfo.
    converters = {
        'name': intro_sentence,
        'birth_name': birth_name_sentence,
        'birth_date': lambda value: birthDateInfo(value),
        'citizenship': keep,
        'birth_place': lambda value: birthPlace(value),
        'spouse': lambda value: spouseInfo(value),
        'years active': keep,
        'occupation': lambda value: OccupationInfo(value),
    }

    rows = ["{{ Infobox person"]
    for field, raw in info.items():
        convert = converters.get(field)
        if convert is None:
            continue
        dInfo[field] = convert(raw)
        rows.append(p + field + espc + raw)

    return nl.join(rows) + "}}", dInfo

In [101]:
def getURinfoBox(engTitle,dInfo):
    """Fetch an English Wikipedia page and derive the Urdu infobox from it.

    Args:
        engTitle: Title of the English Wikipedia article.
        dInfo: Dict handed to ``UrduInfoBox`` to collect the article data.

    Returns:
        Tuple ``(urdu_infobox_markup, article_info_dict)`` as returned by
        ``UrduInfoBox``.
    """
    # Get page based on title and language supplied as arguments
    page = getWikiPage("en", engTitle)

    # The returned wikitext is not used here (parsing the whole page into an
    # unused object was dropped). The call itself is kept.
    # NOTE(review): page.get() presumably raises for missing or redirect
    # pages, which is why it is not removed -- confirm against pywikibot docs.
    page.get()

    # Get all templates including Infobox template
    all_templates = page.raw_extracted_templates

    # Get just Infobox template as a dictionary
    InfoDict = extractInfoBoxTemplate(all_templates)

    # Get Infobox for Urdu Page with required params
    return UrduInfoBox(InfoDict, dInfo)

In [102]:
def writeArticle(engTitle,urIB, dArticle):
    """Write the Urdu infobox and article sentences to a text file.

    The file is ``BASE / 'Articles' / (engTitle + '.txt')``; the 'Articles'
    folder is created when it does not exist yet. Every entry of
    ``dArticle`` is optional: missing or empty entries are skipped.

    Args:
        engTitle: English title, used as the file name stem.
        urIB: Infobox markup written at the top of the file.
        dArticle: Dict with the optional keys name, birth_name, birth_date,
            birth_place, spouse, citizenship and 'years active'.

    Returns:
        None.
    """
    outdir = BASE / 'Articles'
    # Make sure the target folder exists before opening the file in it.
    outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / (engTitle + '.txt')

    with open(outfile, 'w', encoding='utf-8') as ofile:
        # Urdu InfoBox of Article
        ofile.write(f'\n{urIB}\n\n')

        # Ready-made sentences, written as they are. 'name' is looked up with
        # .get() like the others: an infobox without a name used to raise
        # KeyError here and leave a half-written file behind.
        for key in ('name', 'birth_name', 'birth_date', 'birth_place'):
            sentence = dArticle.get(key)
            if sentence:
                ofile.write(f'\n {sentence}')

        # spouse
        if dArticle.get('spouse'):
            spouse = dArticle['spouse']
            ofile.write(f'\n ان کے شریک حیات رہے ہیں {spouse}')

        # citizenship
        if dArticle.get('citizenship'):
            citizen = dArticle['citizenship']
            ofile.write(f'\n ان کی شہریت ملک {citizen} کی ہے')

        # years active
        if dArticle.get('years active'):
            yr_active = dArticle['years active']
            ofile.write(f'\n یہ اپنی پیشہ ورانہ زندگی میں {yr_active} تک متحرک رہیں۔')
        
        
        

In [103]:
# Fill the shared lookup dict `d`; the returned object is that same dict.
en_ur_dict = enURTransliterate()

In [104]:
# Driver: for every row of the input list, build the Urdu infobox from the
# English Wikipedia page and write the article text file.
PersonList = []
for player in lstActorPerson:
        # Fresh dict per person; getURinfoBox fills it with the article data.
        dInfo = {}
        engTitle = player.get('ENG')
        # urTitle is read as a module-level name by birthDateInfo, birthPlace
        # and UrduInfoBox, so it has to be assigned before getURinfoBox runs.
        urTitle  = player.get('URDU')
        PersonList.append({"EN":engTitle,"UR":urTitle})
        
        urIB, articleInfoDict =  getURinfoBox(engTitle,dInfo)
        #print(urIB)
        #print(articleInfoDict)
        
        writeArticle(engTitle, urIB, articleInfoDict)
        print(f'{engTitle} article written ')
        

OrderedDict([('id', '0733196'), ('name', 'Zuleikha Robinson')])
name => Zuleikha Robinson
Getting **df** Template => y
Template => {{Birth date and age|1977|6|29}}
Zuleikha Robinson article written 
OrderedDict([('1', '0000244')])
name => Sigourney Weaver
Getting **mf** Template => y
Template => {{Birth date and age|1949|10|8}}
Sigourney Weaver article written 
