### 創建需要的檔案

In [23]:
# import packages
import os # files and folders directory
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pykakasi # transfer hiragana and katakana to English
import re

In [24]:
# Test cases
katakana_chart = "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヽヾ"
file = "Aichi_1986愛知県第１区ｲﾏｴﾀﾞﾉﾘｵ今枝敬雄.txt"
def char_is_katakana(c) -> bool:
    return u'\30A0' <= c <= u'\30FF'

new_file = ""
kks = pykakasi.kakasi()
result = kks.convert(file)
for word in file:
    if char_is_katakana(word):
        continue
    else:
        new_file += word
new_file = re.sub(r'\d{4}', '', new_file)
print(new_file)


Aichi_愛知県第１区ｲﾏｴﾀﾞﾉﾘｵ今枝敬雄.txt


In [35]:
### 清理原始檔案，將每一年度每一份選舉公報單獨做成一份txt檔
# 設定輸出資料夾
years = [1986, 1990, 1993, 1996, 2000, 2003, 2005, 2009, 2012]
access_right = 0o755

# 區分年度
for year in years:
    output_folder = f'/Users/deankuo/Desktop/python/dessertation_replicate/txt_version/{year}'
    # 建立年份資料夾
    try:
        os.mkdir(output_folder, access_right)
    except OSError:
        print (f"Creation of the directory {output_folder} failed")
    else:
        print (f"Successfully created the directory {output_folder}")
        
    # Manifestos資料夾的位置
    folder_path = f'/Users/deankuo/Desktop/python/dessertation_replicate/{year}'
    
    # Sift halfwidth　カタカナ from file name
    def is_halfwidth_katakana(char):
        unicode_value = ord(char)
        return 0xff65 <= unicode_value <= 0xff9f
    
    # Garbage word
    def is_garbage_word(char):
        garbage_word = ['＃', 'X', '.']
        return True if char in garbage_word else False
    

    # 使用 os.walk() iterate整個資料夾中的檔案
    for root, dirs, files in os.walk(folder_path):
        for file in sorted(files):
            # 跳過這個隱藏資料夾（不確定是否只有mac上會有）
            if file == ".DS_Store":
                continue
            else:
                # Set the ku name
                ku = ""
                for word in file:
                    if word.isnumeric():
                        break
                    else:
                        ku += word
                # Some of the txt file code "Aichi" as "aiti"
                if ku == "aiti":
                    ku = "Aichi"
                else:
                    # Upper the first character
                    ku = ku[0].upper() + ku[1:]
                
                # 使用 with open() 來讀取檔案
                with open(f'/Users/deankuo/Desktop/python/dessertation_replicate/{year}/{file}', 'r') as f:
                    # 設定檔名
                    filename = None
                    # 設定內容
                    content = []
                    for line in f:
                        if line.startswith('＃') or line.startswith("X."): # 應該是encoding問題
                            # 如果檔名已經設定過了就將內容寫入檔案
                            if filename is not None:
                                with open(os.path.join(output_folder, filename), 'w') as output_file:
                                    for item in content:
                                        output_file.write(item)
                            # Remove garbage words from file name line
                            if line.startswith('＃'):
                                temp_filename = ""
                                if '区' in line:
                                    for word in line:
                                        if is_halfwidth_katakana(word) or is_garbage_word(word):
                                            continue
                                        elif word == '区':
                                            temp_filename += '区_'
                                        else:
                                            temp_filename += word
                                else:
                                    for word in line:
                                        if is_halfwidth_katakana(word) or is_garbage_word(word):
                                            continue
                                        elif word == '県':
                                            temp_filename += '県_'
                                        else:
                                            temp_filename += word
                                temp_filename = temp_filename.strip()
                                temp_filename = re.sub(r'\d{4}', '', temp_filename)
                                filename = str(year) + "_" + ku + "_" + temp_filename + '.txt'
                            # 如果開頭是X.的檔名
                            else:
                                temp_filename = ""
                                if '区' in line:
                                    for word in line:
                                        if is_halfwidth_katakana(word) or is_garbage_word(word):
                                            continue
                                        elif word == '区':
                                            temp_filename += '区_'
                                        else:
                                            temp_filename += word
                                else:
                                    for word in line:
                                        if is_halfwidth_katakana(word) or is_garbage_word(word):
                                            continue
                                        elif word == '県':
                                            temp_filename += '県_'
                                        else:
                                            temp_filename += word
                                temp_filename = temp_filename.strip()
                                temp_filename = re.sub(r'\d{4}', '', temp_filename)
                                filename = str(year) + "_" + ku + "_" + temp_filename + '.txt'
                            # 清空內容
                            content = []
                        else:
                            # 加入內容
                            content.append(line)
                    # 將最後一個檔案寫入
                    with open(os.path.join(output_folder, filename), 'w') as output_file:
                        for item in content:
                            output_file.write(item)

Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/1986
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/1990
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/1993
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/1996
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/2000
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/2003
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/2005
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/2009
Successfully created the directory /Users/deankuo/Desktop/python/dessertation_replicate/txt_version/2012


In [33]:
# 計算一共分出多少檔案
years = [1986, 1990, 1993, 1996, 2000, 2003, 2005, 2009, 2012]
sum = 0

for year in years:
    dir_path = f"/Users/deankuo/Desktop/python/dessertation_replicate/txt_version/{year}"
    globals()[f"count_{year}"] = 0
    # Iterate directory
    for path in os.listdir(dir_path):
        # check if current path is a file
        if os.path.isfile(os.path.join(dir_path, path)):
            globals()[f"count_{year}"] += 1
    sum +=  globals()[f"count_{year}"]
    print(f'{year} manifesto files number:' + str(globals()[f"count_{year}"]))
print(f"The total manifesto number (from 1986 to 2009): {sum - 1242}")


1986 manifesto files number:808
1990 manifesto files number:801
1993 manifesto files number:821
1996 manifesto files number:1130
2000 manifesto files number:1099
2003 manifesto files number:1005
2005 manifesto files number:988
2009 manifesto files number:819
2012 manifesto files number:1242
The total manifesto number (from 1986 to 2009): 7471


## 尚未排除掉不具參考性的候選人（沒有主要政黨背書 or 得票少於10000） 

1986 files: 808 > 800  
1990 files: 801 < 854  
1993 files: 821 < 866  
1996 files: 1130 > 1126  
2000 files: 1099 < 1070  
2003 files: 1005 > 994  
2005 files: 987 > 966  
2009 files: 819 < 821  
2012 files: 1242 ? (沒有資料)  
後面的檔案數是從Catalinac, Amy, 2017, "Replication Data for: Positioning under Alternative Electoral Systems", https://doi.org/10.7910/DVN/PENDX4, Harvard Dataverse 中的TDM_new.csv檔案計算出來，總共有7497個選舉公報（和書上的一樣)  

In [34]:
## 確認是否都有分乾淨
year = [1986, 1990, 1993, 1996, 2000, 2003, 2005, 2009, 2012]
# Manifestos資料夾的位置
folder_path = '/Users/deankuo/Desktop/python/dessertation_replicate/'
sum = 0

# 使用 os.walk() 遍歷整個資料夾中的檔案
for y in year:
    manifesto = 0
    for root, dirs, files in os.walk(f'{folder_path}{y}'):
        for file in sorted(files):
            with open(f"{folder_path}{y}/{file}", 'r') as f:
                for line in f:
                    if ('＃' in line) and (line[0] == '＃'):
                        manifesto += 1
                    elif ('X.' in line) and (line[0:2] == 'X.'):
                        manifesto += 1
                    else:
                        continue
    sum += manifesto
    print(f"{y}: {manifesto}")
print(f"The sum of manifestos is (from 1986 to 2009): {sum - 1242}")

1986: 808
1990: 801
1993: 821
1996: 1130
2000: 1099
2003: 1005
2005: 1001
2009: 819
2012: 1242
The sum of manifestos is (from 1986 to 2009): 7484
