# NLP data pre-processing I : converting .docx files into .txt files

### 1. Rename Word documents to UUID names

In [11]:
# If needed, check installed Python versions (different conda environments have different versions of Python) 
# and executable Python (where your system looks for pip-installed packages).
# import sys
# print(sys.path)
# print(sys.executable)

In [12]:
# Install docx2txt package where your executable Python is. 
# Ex:
#!~/anaconda3/bin/python -m pip install docx2txt

In [13]:
# Check for installed dependencies

import pkg_resources
import types
def get_imports():
    """Yield the root package name behind every module or class in ``globals()``.

    For an imported module the first dotted component of ``__name__`` is
    used; for a class the first dotted component of ``__module__`` is used.
    Any other global (plain variables, functions, ...) is skipped, so a
    variable that merely shares its name with an installed package is not
    reported as an import.

    Yields:
        str: a root package name, translated to its pip name when it appears
        in the hand-maintained mapping below. Names may repeat; callers
        de-duplicate as needed.
    """
    # Some packages are weird and have different
    # imported names vs. system/pip names. Unfortunately,
    # there is no systematic way to get pip names from
    # a package's imported name. You'll have to add
    # exceptions to this mapping manually!
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot: this is a lazy generator, and a consumer that
    # binds a new global while looping (e.g. a top-level ``for`` statement)
    # would otherwise trigger "dictionary changed size during iteration".
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package,
            # not just imported function
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            # Not an import: do not let ordinary variable names leak through.
            continue

        yield poorly_named_packages.get(name, name)
# De-duplicated root names of everything imported into this notebook.
imports = list(set(get_imports()))

# There is no direct lookup from an imported name to its installed
# version, so walk every installed distribution and keep the ones whose
# project name was imported (pip itself is left out of the report).
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name in imports and dist.project_name != "pip"
]

# Report each match in pip's "name==version" pinning format.
for r in requirements:
    print("{}=={}".format(*r))

docx2txt==0.8


In [14]:
import os
import uuid
import shutil
import docx2txt

In [15]:
# To start, the parent of the working directory contains a single "source_files" directory whose sub-folders hold the Word files.

# NOTE: any zipped Word files should be unzipped in place (same directory or subdirectory) before proceeding.

In [16]:
def create_directory_if_not_exists(targetDir):
    """Make sure ``targetDir`` exists and return its path.

    Prints whether the directory was created or was already present.

    Args:
        targetDir: directory path, absolute or relative to the current
            working directory. Missing parent directories are created too.

    Returns:
        ``os.path.join(os.getcwd(), targetDir)``, which is ``targetDir``
        itself when it is already absolute.

    Raises:
        OSError: if the directory cannot be created, e.g. because a regular
            file occupies a parent component of the path.
    """
    if not os.path.exists(targetDir):
        # makedirs rather than mkdir, so a nested target such as
        # "out/text_files" works even when "out" does not exist yet.
        os.makedirs(targetDir)
        print("Directory '" + targetDir +  "' created ")
    else:
        print("Directory '" + targetDir +  "' already exists")
    return os.path.join(os.getcwd(), targetDir)


def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the contents of directory ``src`` into directory ``dst``.

    Unlike ``shutil.copytree`` this copies the *entries* of ``src`` rather
    than ``src`` itself. ``dst`` is created when it is missing.

    Args:
        src: directory whose entries are copied.
        dst: destination directory.
        symlinks: forwarded to ``shutil.copytree`` for each sub-directory.
            Top-level non-directory entries are always copied with
            ``shutil.copy2``.
        ignore: optional callable with the ``shutil.copytree`` signature
            ``ignore(directory, names) -> names_to_skip`` (for example
            ``shutil.ignore_patterns(...)``). It is applied to the top-level
            entries of ``src`` as well as inside every sub-directory.

    Raises:
        OSError: propagated from the underlying copy calls.
            ``shutil.copytree`` is called without ``dirs_exist_ok``, so a
            sub-directory that already exists in ``dst`` makes it fail.
    """
    names = os.listdir(src)
    # shutil.copytree only consults `ignore` for the directories it walks
    # itself, so the top level has to be filtered here.
    skipped = set(ignore(src, names)) if ignore is not None else set()

    # copy2 cannot create its parent directory.
    os.makedirs(dst, exist_ok=True)

    for item in names:
        if item in skipped:
            continue
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)
                    
                    
def count_files(folder):
    """Return how many entries ``os.listdir`` reports for ``folder``.

    ``folder`` is resolved against the current working directory (an
    absolute path is used as-is). Every entry is counted: regular files,
    sub-directories and hidden files alike.
    """
    return len(os.listdir(os.path.join(os.getcwd(), folder)))
    

def check_if_dir_exists(targetDir):
    """Print whether the path ``targetDir`` currently exists.

    The check is ``os.path.exists``, so an existing regular file is reported
    as existing too. Nothing is returned.
    """
    verdict = "' exists." if os.path.exists(targetDir) else "' does not exist."
    print("Directory '" + targetDir + verdict)

In [17]:
# 1. Duplicate the "source_files" directory

# All renaming happens on a scratch copy in ./temp1, never on the originals.
target_dir = create_directory_if_not_exists("temp1")

# The originals live in "source_files", one level above the working directory.
path_parent = os.path.split(os.getcwd())[0]
source_dir = os.path.join(path_parent, "source_files")

copytree(source_dir, target_dir)

Directory 'temp1' created 


In [18]:
# 2. Give docx files UUID names and place them in temp2 directory.

source_dir = os.path.join(os.getcwd(), "temp1")
target_dir = create_directory_if_not_exists("temp2")

for foldername in os.listdir(source_dir):

    # Skip hidden entries such as ".DS_Store" on macOS
    if foldername.startswith("."):
        continue

    subdirectory = os.path.join(source_dir, foldername)

    # Only document folders are walked. A loose file at this level would
    # make os.listdir() below raise NotADirectoryError.
    if not os.path.isdir(subdirectory):
        print("Skipping '" + foldername + "': not a folder")
        continue

    for filename in os.listdir(subdirectory):
        original_path = os.path.join(subdirectory, filename)

        # Hidden files and nested folders stay behind in temp1: a folder
        # moved into temp2 would later be handed to docx2txt as a document.
        if filename.startswith(".") or not os.path.isfile(original_path):
            continue

        file, extension = os.path.splitext(filename)
        # replace file name with uuid-name, keeping the original extension
        unique_filename = str(uuid.uuid4()) + extension
        # rename original file with uuid-name and move into 'temp2' directory
        os.rename(original_path, os.path.join(target_dir, unique_filename))

Directory 'temp2' created 


In [19]:
# Sanity check: number of entries in target_dir (temp2 when the cells are run in order).
print(count_files(target_dir))

65


### 2. Create simple text files from Word documents

In [20]:
source_dir = os.path.join(os.getcwd(), "temp2")

# The text files are written next to the working directory, in ../text_files.
path_parent = os.path.dirname(os.getcwd())
target_dir = create_directory_if_not_exists(os.path.join(path_parent, "text_files"))

for process_file in os.listdir(source_dir):

    # Skip hidden entries such as ".DS_Store" on macOS
    if process_file.startswith("."):
        continue

    file, _ = os.path.splitext(process_file)

    # Create a new text file name by concatenating the .txt extension to file UUID
    dest_file = file + '.txt'
    print(dest_file)

    # extract text from the file
    content = docx2txt.process(os.path.join(source_dir, process_file))

    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent characters found in the documents. The `with` block
    # closes the file even when the write fails.
    with open(os.path.join(target_dir, dest_file), "w", encoding="utf-8") as write_text_file:
        write_text_file.write(content)

Directory '/Users/monty/working_dir/Edilex-with-single-Python-file/text_files' created 
b932c119-cda9-45ac-bc54-2f1aac216897.txt
c384640d-a2eb-4b4a-accc-5b0d2341a940.txt
6efb6fcf-5c72-4caa-b8e3-a68f26602792.txt
d93f5f96-e9b9-4895-924e-292853845fe4.txt
76c75e1b-2fa0-4062-9385-a21d45e8a683.txt
5a5d7d8a-f952-4fdc-8696-6db1ee7a145e.txt
9c651a6c-9781-4309-9d14-6cc3e98a0c8f.txt
8aa2c016-3ebe-4a13-b6bd-dff3311ec0c9.txt
b43b337d-93db-4a3e-9465-a4674550c5c2.txt
5cd307eb-1091-426b-8da3-25db1c257a54.txt
5b5f1055-cd1d-4ce8-aa06-6b8852583758.txt
6f4d6715-f275-40c2-abfb-69cd5ce061d8.txt
c3e9fbfe-93da-47b3-a763-caed3461a03d.txt
83d246b1-9c55-48fb-8e96-89285b550478.txt
6ac06767-8a1f-4635-b919-d04609d05d24.txt
e1de91d7-f3ba-4e0a-98b1-8fb6f62a41ed.txt
fcd77064-36e8-442f-acbe-d1ac227b582b.txt
ea75905c-cbd0-409e-9ba0-08e571ca0400.txt
0041937d-ca0a-49a6-a4eb-6baf71b63c6f.txt
80bb16c2-75dd-411d-9e06-c306f864f023.txt
8dedca36-0f08-4d0b-9538-0a6f8153bc60.txt
bcf5382e-3894-4a67-9af3-a7b9b78ae651.txt
c04b19a0-1

In [21]:
# Sanity check: number of entries in target_dir (text_files when the cells are run in order).
print(count_files(target_dir))

65


In [22]:
# 3. Delete temp1 and temp2 directories.
dir1 = os.path.abspath("temp1")
dir2 = os.path.abspath("temp2")

# Both scratch trees are removed recursively, temp1 first.
for scratch_dir in (dir1, dir2):
    shutil.rmtree(scratch_dir)

In [23]:
# Confirm the clean-up: both scratch directories should now be reported as missing.
check_if_dir_exists("temp1")
check_if_dir_exists("temp2")

Directory 'temp1' does not exist.
Directory 'temp2' does not exist.
