In [12]:
import os
import codecs
import chardet
import shutil

from collections import Counter
from tqdm import tqdm as tqdm
import pandas as pd
import numpy as np

In [14]:
# Directories deleted by convertFileWithDetection because one of their files
# could not be decoded.
removed = list()
# Per-directory count of files whose detected encoding differed from the
# requested target encoding.
orig_formats = Counter()

In [15]:
"""
The following block of file-decoding functions is a heavily modified version of
Sebastian RoccoSerra's answer to this Stack Overflow question:
https://stackoverflow.com/questions/191359/how-to-convert-a-file-to-utf-8-in-python
(the block ends at the 'End of file decoding function block' marker string below)
"""
def get_encoding_type(current_file):
    """Guess the character encoding of a file from its raw bytes.

    Parameters
    ----------
    current_file : string
                   path to the file to inspect

    Returns
    -------
    The value stored under the 'encoding' key of chardet.detect()'s result
    (presumably None when chardet cannot make a guess -- confirm against
    chardet's documentation).
    """
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(current_file, 'rb') as raw_file:
        detectee = raw_file.read()
    result = chardet.detect(detectee)
    return result['encoding']

def writeConversion(sourceFh, sourceFile, outputDir, targetFormat):
    """Write every line of an open source to a file encoded as targetFormat.

    Parameters
    ----------
    sourceFh : iterable of text lines
               already-decoded lines to copy (e.g. a file opened with codecs.open)
    sourceFile : string
                 name of the file to create inside outputDir
    outputDir : string
                directory to write into; created if missing. An empty string
                means the current working directory.
    targetFormat : string
                   encoding used for the output file
    """
    if outputDir:
        # exist_ok avoids the check-then-create race of exists() + makedirs();
        # the guard skips makedirs(''), which always fails.
        os.makedirs(outputDir, exist_ok=True)
    # os.path.join keeps an empty outputDir relative instead of producing
    # '/<file>' at the filesystem root.
    targetPath = os.path.join(outputDir, sourceFile)
    with codecs.open(targetPath, 'w', targetFormat) as targetFile:
        for line in sourceFh:
            targetFile.write(line)

def convertFileWithDetection(sourceDir, sourceFile, outputDir, targetFormat, replace=False,
                             logs=False):
    """Detect the encoding of one file and rewrite it encoded as targetFormat.

    Parameters
    ----------
    sourceDir : string
                directory containing the file
    sourceFile : string
                 name of the file inside sourceDir
    outputDir : string
                directory the converted file is written to
    targetFormat : string
                   encoding of the converted file
    replace : bool
              when True the original is first renamed to "__orig__<name>",
              read from there, and deleted after a successful conversion
    logs : bool
           whether to print progress messages

    Side effects
    ------------
    Increments orig_formats[sourceDir] when the detected encoding differs from
    targetFormat. If the file cannot be decoded, the WHOLE sourceDir is deleted
    and its path is appended to the module-level list "removed".
    """
    if logs:
        print("Converting '" + sourceFile + "'...")
    sourcePath = os.path.join(sourceDir, sourceFile)
    if replace:
        # Move the original aside so the converted copy can take its name.
        backupPath = os.path.join(sourceDir, "__orig__" + sourceFile)
        os.rename(sourcePath, backupPath)
        sourcePath = backupPath

    sourceFormat = get_encoding_type(sourcePath)

    if sourceFormat != targetFormat:
        orig_formats[sourceDir] += 1

    try:
        # Plain 'r': the legacy 'U' flag is rejected by open() on Python 3.11+,
        # and codecs.open() reads in binary mode whenever an encoding is given,
        # so the flag never had an effect here.
        with codecs.open(sourcePath, 'r', sourceFormat) as sourceFh:
            writeConversion(sourceFh, sourceFile, outputDir, targetFormat)
    except (UnicodeDecodeError, LookupError):
        # LookupError: the detected encoding name is unknown to Python. Treat it
        # like any other undecodable file instead of crashing with the original
        # left renamed to "__orig__...".
        print("Error: failed to convert " + sourceFile + ". Removing...")
        shutil.rmtree(sourceDir)
        removed.append(sourceDir)
        return

    if logs:
        print('Done.')
    if replace:
        os.remove(sourcePath)

def convertFileBestGuess(filename, outputDir=None, targetFormat='utf-8'):
    """Convert a file by trying a fixed list of source encodings in order.

    Fallback for when encoding detection is not used: the file is decoded as
    'ascii' and, if that fails, as 'iso-8859-1'; the first encoding that
    decodes the whole file wins.

    Parameters
    ----------
    filename : string
               path to the file to convert
    outputDir : string or None
                directory for the converted file; defaults to a "converted"
                directory next to the source file
    targetFormat : string
                   encoding of the converted file
    """
    sourceDir, sourceFile = os.path.split(filename)
    if outputDir is None:
        outputDir = os.path.join(sourceDir, 'converted')
    for candidate in ('ascii', 'iso-8859-1'):
        try:
            # Decoding happens lazily while writeConversion iterates, so a
            # wrong guess surfaces as UnicodeDecodeError inside this block;
            # the next attempt reopens the output with 'w' and overwrites it.
            with codecs.open(filename, 'r', candidate) as sourceFh:
                writeConversion(sourceFh, sourceFile, outputDir, targetFormat)
                print('Done.')
                return
        except UnicodeDecodeError:
            pass
"""
End of file decoding function block from Stack Overflow
"""
def convert_r_files(path, replace=False, output_path=''):
    """Convert all R files to utf-8 in the directory pointed to by path

    Only entries of ``path`` itself whose names end in ".R" or ".r" are
    converted; subdirectories are not searched.

    Parameters
    ----------
    path : string
           path to the directory containing R scripts
    output_path : string
                  relative path from "path" parameter to directory
                  to place converted files ("converted" when empty);
                  ignored when replace is True
    replace : bool
              whether to replace original files with converted ones
    """
    targetFormat = 'utf-8'
    # Converted files overwrite the originals when replacing; otherwise they go
    # into a subdirectory of path.
    outputDir = path if replace else os.path.join(path, output_path or 'converted')
    orig_files = [my_file for my_file in os.listdir(path)
                  if my_file.endswith((".R", ".r"))]
    for my_file in orig_files:
        # convertFileWithDetection deletes the whole directory when a file
        # cannot be decoded; stop rather than crash on the files that were
        # deleted along with it.
        if not os.path.isdir(path):
            break
        convertFileWithDetection(path, my_file, outputDir, targetFormat, replace)

In [16]:
# Convert the R scripts of every dataset directory in place, skipping the
# macOS Finder metadata entry.
dois = os.listdir("../dataverse_data")
for doi in tqdm(dois):
    if doi == '.DS_Store':
        continue
    convert_r_files(os.path.join("../dataverse_data", doi), replace=True)

  7%|▋         | 45/609 [00:01<00:19, 29.47it/s]

Error: failed to convert power-analysis.R. Removing...


 57%|█████▋    | 346/609 [00:22<00:17, 15.08it/s]

Error: failed to convert lik.R. Removing...


100%|██████████| 609/609 [00:47<00:00, 12.91it/s]


In [17]:
# Directories that were deleted because one of their files could not be decoded.
removed

['../dataverse_data/doi--10.7910-DVN-IYAX3D',
 '../dataverse_data/doi--10.7910-DVN-SSEZKB']

In [19]:
# Number of directories that contained at least one file whose detected
# encoding differed from the target encoding.
len(orig_formats)

567

In [None]:
# Bare reference: evaluating this cell only displays the function object;
# it does not run a conversion.
convert_r_files