# NLP data pre-processing I : converting .docx files into .txt files

### 1. Rename Word documents to UUID names

In [4]:
# If needed, check installed Python versions (different conda environments have different versions of Python) 
# and executable Python (where your system looks for pip-installed packages).
# import sys
# print(sys.path)
# print(sys.executable)

In [None]:
# Install docx2txt package where your executable Python is. 
# Ex:
#!~/anaconda3/bin/python -m pip install docx2txt

In [17]:
# Check for installed dependencies

import pkg_resources
import types
def get_imports():
    """Yield the root package name of each module or class bound in this module's globals.

    For a module the root is the first dotted component of ``__name__``; for a
    class it is the first dotted component of ``__module__``. Any other kind of
    global (functions, strings, numbers, ...) is skipped, so plain variable
    names are never reported as packages.

    Yields:
        str: a root package name, translated to its pip project name when it
        appears in ``poorly_named_packages``. Names may repeat; callers that
        need uniqueness should wrap the result in ``set()``.
    """
    # Some packages are weird and have different
    # imported names vs. system/pip names. Unfortunately,
    # there is no systematic way to get pip names from
    # a package's imported name. You'll have to add
    # exceptions to this list manually!
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot: the generator may be suspended while the caller
    # binds new globals, which would invalidate a live dict iterator.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package,
            # not just imported function
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            # Not a module or a class: there is no package to report.
            continue

        yield poorly_named_packages.get(name, name)
# Collapse the discovered root package names into a duplicate-free list.
imports = list(set(get_imports()))

# There is no direct lookup from an imported name to its installed version,
# so walk every installed distribution and keep the ones whose project name
# matches an imported package ("pip" itself is left out).
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name in imports and dist.project_name != "pip"
]

# Report each match in the "name==version" form used by requirements files.
for requirement in requirements:
    print("{}=={}".format(*requirement))

docx2txt==0.8


In [18]:
import os
import uuid
import shutil
import docx2txt

In [None]:
# To start, your directory has a single "source_files" directory with document folders containing Word files. 

# NOTE: any zipped Word files should be unzipped in the same directory (subdirectory) before proceeding.

In [19]:
def create_directory_if_not_exists(targetDir):
    """Create ``targetDir`` when it is missing and return its location.

    Args:
        targetDir: directory path, relative to the current working directory
            or absolute. Missing parent directories are created as well.

    Returns:
        ``targetDir`` joined onto the current working directory (an absolute
        ``targetDir`` is returned unchanged by ``os.path.join``).

    Prints a message saying whether the directory was created or already
    existed.
    """
    if not os.path.exists(targetDir):
        # makedirs, unlike mkdir, also creates any missing parent directories,
        # so nested targets such as "out/run1" work on the first call.
        os.makedirs(targetDir)
        print("Directory '" + targetDir +  "' created ")
    else:
        print("Directory '" + targetDir +  "' already exists")
    return os.path.join(os.getcwd(), targetDir)


def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the *contents* of ``src`` into ``dst``.

    Unlike ``shutil.copytree``, ``dst`` may already exist. Sub-directories that
    are already present in ``dst`` are merged into rather than raising
    ``FileExistsError``, so the call can safely be repeated.

    Args:
        src: directory whose entries are copied.
        dst: destination directory; created if it does not exist.
        symlinks: forwarded to ``shutil.copytree`` for sub-directories.
        ignore: optional callable ``ignore(directory, names)`` returning the
            names to skip (e.g. ``shutil.ignore_patterns(...)``). It is applied
            to the top-level entries of ``src`` and forwarded for
            sub-directories.
    """
    os.makedirs(dst, exist_ok=True)

    names = os.listdir(src)
    # Apply the ignore filter at the top level too; shutil.copytree only
    # applies it inside the sub-directories it is handed.
    ignored = set(ignore(src, names)) if ignore is not None else set()

    for item in names:
        if item in ignored:
            continue
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            if os.path.isdir(d):
                # Destination sub-directory already exists (e.g. on a re-run):
                # merge into it instead of letting shutil.copytree fail.
                copytree(s, d, symlinks, ignore)
            else:
                shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)
                    
                    
def count_files(folder):
    """Return how many entries ``os.listdir`` reports for ``folder``.

    ``folder`` is joined onto the current working directory. Every entry is
    counted, including sub-directories and hidden files.
    """
    location = os.path.join(os.getcwd(), folder)
    return len(os.listdir(location))
    

def check_if_dir_exists(targetDir):
    """Print whether the path ``targetDir`` currently exists.

    Nothing is returned; the result is reported on standard output only.
    """
    if os.path.exists(targetDir):
        print("Directory '" + targetDir +  "' exists.")
        return
    print("Directory '" + targetDir +  "' does not exist.")

In [21]:
# 1. Duplicate the "source_files" directory

# "source_files" sits in the parent of the current working directory.
path_parent = os.path.dirname(os.getcwd())
source_dir = os.path.join(path_parent, "source_files")

# The copy goes into "temp1" inside the current working directory.
target_dir = create_directory_if_not_exists("temp1")
copytree(src=source_dir, dst=target_dir)

Directory 'temp1' created 


In [22]:
# 2. Give docx files UUID names and place them in temp2 directory.

source_dir = os.path.join(os.getcwd(), "temp1")
target_dir = create_directory_if_not_exists("temp2")

for foldername in os.listdir(source_dir):
    subdirectory = os.path.join(source_dir, foldername)

    # Skip hidden entries (e.g. ".DS_Store" on macOS) and loose files sitting
    # directly in temp1: only document folders can be listed.
    if foldername.startswith(".") or not os.path.isdir(subdirectory):
        continue

    for filename in os.listdir(subdirectory):
        original_path = os.path.join(subdirectory, filename)

        # Move regular, non-hidden files only; a nested folder moved into
        # temp2 could not be processed as a document later on.
        if filename.startswith(".") or not os.path.isfile(original_path):
            continue

        # Keep the original extension, replace the base name with a UUID.
        _, extension = os.path.splitext(filename)
        unique_filename = str(uuid.uuid4()) + extension

        # Rename the file to its UUID name and move it into 'temp2'.
        os.rename(original_path, os.path.join(target_dir, unique_filename))

Directory 'temp2' created 


In [23]:
# Sanity check: number of entries in target_dir (the temp2 directory set by the preceding cell).
print(count_files(target_dir))

65


### 2. Create simple text files from Word documents

In [24]:
source_dir = os.path.join(os.getcwd(), "temp2")

# The text files are written to "text_files" in the parent of the working directory.
path_parent = os.path.dirname(os.getcwd())
target_dir = create_directory_if_not_exists(os.path.join(path_parent, "text_files"))

for process_file in os.listdir(source_dir):

    # Skip hidden files such as ".DS_Store".
    if process_file.startswith("."):
        continue

    file = os.path.splitext(process_file)[0]

    # Create a new text file name by concatenating the .txt extension to file UUID
    dest_file = file + '.txt'
    print(dest_file)

    # Extract text from the file
    content = docx2txt.process(os.path.join(source_dir, process_file))

    # Write as UTF-8 explicitly so accented or other non-ASCII characters do
    # not depend on the platform's default encoding; the context manager
    # closes the file even if the write fails.
    with open(os.path.join(target_dir, dest_file), "w", encoding="utf-8") as write_text_file:
        write_text_file.write(content)

Directory '/Users/monty/working_dir/Edilex/text_files' created 
4918e660-ebdc-459e-8cfb-283340e52042.txt
781c8bc1-c723-42fa-a571-720c57980096.txt
b1988128-eb2c-4ea9-a418-f40becfde2e2.txt
5774a3d1-0a69-4078-a6c8-40082582e879.txt
07073783-a0ec-4104-8599-b39efbc7b73d.txt
9f85a774-5523-43bb-8d8c-6590e5a25720.txt
97fcb5e7-2afc-4808-8122-032c34e2fdc2.txt
25b829d0-efcf-4f6b-9456-d40314de79c7.txt
ae72205d-4e3b-4bf4-b95f-fa621eb85210.txt
08964fb8-a318-449b-ae1f-2442db4b8b08.txt
96179583-d648-4a25-b68d-09b8e6155f93.txt
a2943fb6-64d9-466f-81ec-fb075129e0ee.txt
2a246845-f172-4bf0-94ea-1a7848f4b6a8.txt
401351a0-602a-4d01-9de1-30303aa23317.txt
01c6bdd2-79eb-46cd-b9ce-9544c255cd29.txt
938e26ea-1929-4ef5-a0e9-0dae5171c67c.txt
ecf5a254-ba1d-43b3-93f3-73a6cfa5e8e9.txt
c014689f-d7d1-4de7-b73f-05e97906d179.txt
abfc9d1d-8757-4adf-8163-a1f1f4dcc0cd.txt
5ed13291-4808-46c0-a614-6d6c93d1bbab.txt
e143b00f-9606-4d52-90ee-7de6f2e77512.txt
4e35b141-7214-47cb-9b21-aa353b294899.txt
da3304c7-1c07-47e6-ae51-c118c5ace4

In [25]:
# Sanity check: number of entries in target_dir (the text_files directory set by the preceding cell);
# compare it with the temp2 count printed earlier.
print(count_files(target_dir))

65


In [26]:
# 3. Delete temp1 and temp2 directories.
dir1 = os.path.join(os.getcwd(), "temp1")
dir2 = os.path.join(os.getcwd(), "temp2")

# Remove each temporary directory together with everything inside it.
for temp_dir in (dir1, dir2):
    shutil.rmtree(temp_dir)

In [27]:
# Report whether the two temporary directories still exist after the cleanup step.
check_if_dir_exists("temp1")
check_if_dir_exists("temp2")

Directory 'temp1' does not exist.
Directory 'temp2' does not exist.
