<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 5 - eyamrog

The aim of this phase is to compile the `QJPP` corpus (Human-Authored Reference Corpus).

## Required Python packages

- pandas
- nltk
- matplotlib
- openpyxl (Excel writer engine used by `DataFrame.to_excel`)

## Import the required libraries

In [1]:
import pandas as pd
import os
import sys
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import random

## Define input variables

In [2]:
# Directory the DataFrame (`df_qjpp.jsonl`) is read from
input_directory = "cl_st2_ph4_eyamrog"
# Directory receiving every file written by this notebook
output_directory = "cl_st2_ph5_eyamrog"
# Root directory of the source text files that are copied into the corpus
files_directory = "cl_st2_ph2_eyamrog"

## Create output directory

In [3]:
# Make sure the output directory exists before anything is written to it.
# `os.path.isdir` is used rather than `os.path.exists` so that a regular file
# carrying the same name is not reported as "already exists": in that case
# `os.makedirs` raises an OSError, which is reported below and stops the run.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        os.makedirs(output_directory)
        print('Output directory successfully created.')
    except OSError as e:
        print('Failed to create the directory:', e)
        sys.exit(1)

Output directory already exists.


### Create output subdirectories

In [4]:
def create_directory(path):
    """Create the directory `path` (and any missing parents) if needed.

    A message describing the outcome is printed in every case.

    Args:
        path: Path of the directory to create.

    Raises:
        SystemExit: With exit code 1 when the directory cannot be created.
            This includes the case where `path` already exists but is not a
            directory (e.g. a regular file with the same name).
    """
    # isdir, not exists: a regular file at `path` must not be reported as an
    # existing directory, otherwise later writes into it fail obscurely.
    if os.path.isdir(path):
        print(f"Directory already exists: {path}")
        return

    try:
        os.makedirs(path)
        print(f"Successfully created the directory: {path}")
    except OSError as e:
        print(f"Failed to create the {path} directory: {e}")
        sys.exit(1)

## Import the data into a DataFrame

In [5]:
# Load the JSON Lines file (one record per line) from the input directory
df_qjpp = pd.read_json(
    path_or_buf=f"{input_directory}/df_qjpp.jsonl",
    lines=True,
)

In [6]:
# Interpret `Published` as epoch milliseconds and convert it to datetimes
df_qjpp = df_qjpp.assign(
    Published=pd.to_datetime(df_qjpp['Published'], unit='ms')
)

## Data Wrangling

### Drop unused columns

In [7]:
# Remove the columns that are not used in the rest of the notebook
df_qjpp = df_qjpp.drop(
    ['Open Access', 'Open Access 1', 'Article Type'],
    axis='columns',
)

### Reorder the columns

In [8]:
# Show the current column order
list(df_qjpp.columns)

['Title',
 'URL',
 'Authors',
 'Published',
 'PDF URL',
 'Discipline',
 'Journal',
 'ID',
 'Vol/Issue',
 'DOI']

In [9]:
# Desired order of the DataFrame columns
reordered_columns = [
    "Journal", "Title", "Authors", "Published", "Vol/Issue",
    "URL", "DOI", "PDF URL",
    "Discipline", "ID",
]

In [10]:
# Columns named in `reordered_columns` come first, in that order; any column
# not named there follows, keeping its current relative position.
df_qjpp = df_qjpp.loc[
    :,
    reordered_columns
    + [name for name in df_qjpp.columns if name not in reordered_columns],
]

### Handling missing values

In [11]:
# Number of missing values in each column
df_qjpp.isnull().sum()

Journal        0
Title          0
Authors        0
Published      0
Vol/Issue     22
URL            0
DOI           89
PDF URL       51
Discipline     0
ID             0
dtype: int64

In [12]:
# Replace missing values with a placeholder, only in the listed columns
df_qjpp = df_qjpp.fillna(
    value={
        'Vol/Issue': 'Not defined',
        'DOI': 'Not defined',
        'PDF URL': 'Not defined',
    }
)

In [13]:
# Total number of missing values left in the whole DataFrame
df_qjpp.isnull().sum().sum()

0

### Adding the `Text ID` column

In [14]:
# Leading string of every `Text ID` value
prefix = "t"

In [15]:
# Build each `Text ID` as the prefix followed by the row's index label,
# left-padded with zeros to six characters.
df_qjpp['Text ID'] = [
    f"{prefix}{str(label).zfill(6)}" for label in df_qjpp.index
]

### Export into a file

In [16]:
# Write the DataFrame as JSON Lines: one record per line
df_qjpp.to_json(
    path_or_buf=f"{output_directory}/df_qjpp.jsonl",
    lines=True,
    orient='records',
)

In [17]:
# Write the same data as an Excel workbook, without the index column
df_qjpp.to_excel(
    excel_writer=f"{output_directory}/df_qjpp.xlsx",
    index=False,
)

### Fetching the text files

#### Import the data into a DataFrame

In [18]:
# Reload the DataFrame from the JSON Lines file written to the output directory
df_qjpp = pd.read_json(
    path_or_buf=f"{output_directory}/df_qjpp.jsonl",
    lines=True,
)

In [19]:
# Interpret `Published` as epoch milliseconds and convert it to datetimes
df_qjpp = df_qjpp.assign(
    Published=pd.to_datetime(df_qjpp['Published'], unit='ms')
)

#### Fetching the files

In [20]:
# Copy every text listed in the DataFrame from the source tree into the output
# directory, renaming it from its `ID` to its `Text ID`. Each file is read as
# UTF-8 text and written back as UTF-8 with '\n' line endings.

# The destination directory is the same for every file, so it is ensured once
# here instead of on every iteration.
os.makedirs(output_directory, exist_ok=True)

for file_id, text_id in zip(df_qjpp['ID'], df_qjpp['Text ID']):
    # Source files live in a subdirectory named after the first four
    # characters of the ID. This is deliberately NOT called `prefix`: that
    # notebook-level name holds the `Text ID` prefix, and rebinding it here
    # would corrupt the `Text ID` column if that cell were re-run afterwards.
    id_prefix = file_id[:4]

    src_path = os.path.join(files_directory, id_prefix, f"{file_id}.txt")
    dst_path = os.path.join(output_directory, f"{text_id}.txt")

    try:
        with open(src_path, 'r', encoding='utf-8') as infile:
            content = infile.read()

        with open(dst_path, 'w', encoding='utf-8', newline='\n') as outfile:
            outfile.write(content)

    except FileNotFoundError:
        # A missing source is reported but does not stop the remaining copies.
        print(f"Missing source file: {src_path}")
    except Exception as e:
        # Best effort: report any other failure (e.g. a decoding error) and
        # carry on with the next file.
        print(f"Error processing {file_id}: {e}")

### Manual inspection and clean up