<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 4 - Phase 1 - eyamrog

The aim of this phase is to consolidate the `EL2AP`, `AI-EL2AP`, and `QJPP` corpora into a single data set for statistical analysis.

## Required Python packages

- pandas
- matplotlib
- openpyxl (or XlsxWriter) — Excel writer engine that pandas needs for the `.xlsx` export

## Import the required libraries

In [1]:
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt

## Define input variables

In [2]:
# This phase reads its input files from, and writes its exports to, the same folder.
output_directory = "cl_st4_ph1_eyamrog"
input_directory = "cl_st4_ph1_eyamrog"

## Create output directory

In [3]:
# Make sure the output directory exists before anything is written to it.
# `os.path.isdir` is used instead of `os.path.exists` so that a regular file
# with the same name is not mistaken for an existing directory.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        # exist_ok=True tolerates the directory appearing between the check
        # above and this call; a non-directory at that path still raises
        # FileExistsError, which is an OSError and is reported below.
        os.makedirs(output_directory, exist_ok=True)
        print('Output directory successfully created.')
    except OSError as e:
        print('Failed to create the directory:', e)
        # Stop here: the export cells write into this directory.
        sys.exit(1)

Output directory already exists.


## Import the data into DataFrames

### `EL2AP`

In [4]:
# Load the EL2AP records; lines=True reads the file as one JSON object per line.
df_el2ap_dimensions = pd.read_json(
    path_or_buf=f"{input_directory}/df_el2ap_dimensions.jsonl",
    lines=True,
)

In [5]:
# Convert both date columns, interpreting the stored numbers as milliseconds
# since the Unix epoch (unit='ms').
for date_column in ('Submitted', 'Posted'):
    df_el2ap_dimensions[date_column] = pd.to_datetime(
        df_el2ap_dimensions[date_column], unit='ms'
    )

### `AI-EL2AP`

In [6]:
# Load the AI-EL2AP records; lines=True reads the file as one JSON object per line.
df_ai_el2ap_dimensions = pd.read_json(
    path_or_buf=f"{input_directory}/df_ai_el2ap_dimensions.jsonl",
    lines=True,
)

In [7]:
# Convert both date columns, interpreting the stored numbers as milliseconds
# since the Unix epoch (unit='ms').
for date_column in ('Submitted', 'Posted'):
    df_ai_el2ap_dimensions[date_column] = pd.to_datetime(
        df_ai_el2ap_dimensions[date_column], unit='ms'
    )

### `QJPP`

In [8]:
# Load the QJPP records; lines=True reads the file as one JSON object per line.
df_qjpp_dimensions = pd.read_json(
    path_or_buf=f"{input_directory}/df_qjpp_dimensions.jsonl",
    lines=True,
)

In [9]:
# Convert the publication date, interpreting the stored numbers as
# milliseconds since the Unix epoch (unit='ms').
df_qjpp_dimensions['Published'] = pd.to_datetime(
    df_qjpp_dimensions['Published'],
    unit='ms',
)

## Reorganise `EL2AP`

### Drop columns

In [10]:
# Remove the columns that are not carried over from EL2AP. The original
# 'Published' column is dropped here because 'Submitted' is renamed to
# 'Published' in the next step.
df_el2ap_dimensions = df_el2ap_dimensions.drop(
    columns=[
        'Published',
        'PDF Language',
        'Posted',
        'Text Paragraph ChatGPT',
        'AI-EL2AP Filename',
    ]
)

### Rename columns

In [11]:
# The submission date takes over the 'Published' column name.
df_el2ap_dimensions = df_el2ap_dimensions.rename(
    columns={'Submitted': 'Published'}
)

In [12]:
# Use the corpus-neutral column name 'Filename'.
df_el2ap_dimensions = df_el2ap_dimensions.rename(
    columns={'EL2AP Filename': 'Filename'}
)

### Create columns

In [13]:
# Label every row with the name of the corpus it comes from.
df_el2ap_dimensions = df_el2ap_dimensions.assign(Source='EL2AP')

In [14]:
# Every EL2AP row gets the same constant 'Journal' value.
df_el2ap_dimensions = df_el2ap_dimensions.assign(Journal='SciELO Preprints')

In [15]:
# Placeholder value; presumably added so that EL2AP has the same columns as
# QJPP when the frames are concatenated (to be confirmed).
# The column name contains '/', so it is passed through a dict.
df_el2ap_dimensions = df_el2ap_dimensions.assign(**{'Vol/Issue': 'Not defined'})

In [16]:
# Placeholder value; presumably added so that EL2AP has the same columns as
# QJPP when the frames are concatenated (to be confirmed).
df_el2ap_dimensions = df_el2ap_dimensions.assign(DOI='Not defined')

In [17]:
# Placeholder value; presumably added so that EL2AP has the same columns as
# QJPP when the frames are concatenated (to be confirmed).
df_el2ap_dimensions = df_el2ap_dimensions.assign(ID='Not defined')

### Reordering columns

In [18]:
# Columns that must lead the DataFrame, in exactly this order. Columns not
# listed here are appended after them by the reordering step that follows.
ordered_columns = [
    'Source', 'Discipline', 'Journal', 'Vol/Issue', 'Published',
    'Title', 'Authors', 'URL', 'DOI', 'PDF URL',
    'ID', 'Text ID',
    'Section', 'Section Code',
    'Paragraph', 'Paragraph Code',
    'Filename', 'Text Paragraph',
]

In [19]:
# Put the columns named in `ordered_columns` first, then every remaining
# column in its current relative order.
df_el2ap_dimensions = df_el2ap_dimensions.loc[
    :,
    [
        *ordered_columns,
        *(name for name in df_el2ap_dimensions.columns if name not in ordered_columns),
    ],
]

In [20]:
# Display the column labels of the EL2AP frame in their current order.
df_el2ap_dimensions.columns

Index(['Source', 'Discipline', 'Journal', 'Vol/Issue', 'Published', 'Title',
       'Authors', 'URL', 'DOI', 'PDF URL', 'ID', 'Text ID', 'Section',
       'Section Code', 'Paragraph', 'Paragraph Code', 'Filename',
       'Text Paragraph', 'Type/Token', 'Word Length', 'Word Count',
       'Factor 1 Score', 'Factor 2 Score', 'Factor 3 Score', 'Factor 4 Score',
       'Factor 5 Score'],
      dtype='object')

## Reorganise `AI-EL2AP`

### Drop columns

In [21]:
# Remove the columns that are not carried over from AI-EL2AP. The original
# 'Published' and 'Text Paragraph' columns are dropped here because
# 'Submitted' and 'Text Paragraph ChatGPT' are renamed to those names in the
# next steps.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.drop(
    columns=[
        'Published',
        'PDF Language',
        'Posted',
        'Text Paragraph',
        'EL2AP Filename',
    ]
)

### Rename columns

In [22]:
# The submission date takes over the 'Published' column name.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.rename(
    columns={'Submitted': 'Published'}
)

In [23]:
# Use the corpus-neutral column name 'Filename'.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.rename(
    columns={'AI-EL2AP Filename': 'Filename'}
)

In [24]:
# The ChatGPT paragraph column becomes this corpus's 'Text Paragraph' column.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.rename(
    columns={'Text Paragraph ChatGPT': 'Text Paragraph'}
)

### Create columns

In [25]:
# Label every row with the name of the corpus it comes from.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.assign(Source='AI-EL2AP')

In [26]:
# Every AI-EL2AP row gets the same constant 'Journal' value.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.assign(Journal='SciELO Preprints')

In [27]:
# Placeholder value; presumably added so that AI-EL2AP has the same columns
# as QJPP when the frames are concatenated (to be confirmed).
# The column name contains '/', so it is passed through a dict.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.assign(**{'Vol/Issue': 'Not defined'})

In [28]:
# Placeholder value; presumably added so that AI-EL2AP has the same columns
# as QJPP when the frames are concatenated (to be confirmed).
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.assign(DOI='Not defined')

In [29]:
# Placeholder value; presumably added so that AI-EL2AP has the same columns
# as QJPP when the frames are concatenated (to be confirmed).
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.assign(ID='Not defined')

### Reordering columns

In [30]:
# Columns that must lead the DataFrame, in exactly this order. Columns not
# listed here are appended after them by the reordering step that follows.
ordered_columns = [
    'Source', 'Discipline', 'Journal', 'Vol/Issue', 'Published',
    'Title', 'Authors', 'URL', 'DOI', 'PDF URL',
    'ID', 'Text ID',
    'Section', 'Section Code',
    'Paragraph', 'Paragraph Code',
    'Filename', 'Text Paragraph',
]

In [31]:
# Put the columns named in `ordered_columns` first, then every remaining
# column in its current relative order.
df_ai_el2ap_dimensions = df_ai_el2ap_dimensions.loc[
    :,
    [
        *ordered_columns,
        *(name for name in df_ai_el2ap_dimensions.columns if name not in ordered_columns),
    ],
]

In [32]:
# Display the column labels of the AI-EL2AP frame in their current order.
df_ai_el2ap_dimensions.columns

Index(['Source', 'Discipline', 'Journal', 'Vol/Issue', 'Published', 'Title',
       'Authors', 'URL', 'DOI', 'PDF URL', 'ID', 'Text ID', 'Section',
       'Section Code', 'Paragraph', 'Paragraph Code', 'Filename',
       'Text Paragraph', 'Type/Token', 'Word Length', 'Word Count',
       'Factor 1 Score', 'Factor 2 Score', 'Factor 3 Score', 'Factor 4 Score',
       'Factor 5 Score'],
      dtype='object')

## Reorganise `QJPP`

### Rename columns

In [33]:
# Use the corpus-neutral column name 'Filename'.
df_qjpp_dimensions = df_qjpp_dimensions.rename(
    columns={'QJPP Filename': 'Filename'}
)

### Create columns

In [34]:
# Label every row with the name of the corpus it comes from.
df_qjpp_dimensions = df_qjpp_dimensions.assign(Source='QJPP')

### Reordering columns

In [35]:
# Columns that must lead the DataFrame, in exactly this order. Columns not
# listed here are appended after them by the reordering step that follows.
ordered_columns = [
    'Source', 'Discipline', 'Journal', 'Vol/Issue', 'Published',
    'Title', 'Authors', 'URL', 'DOI', 'PDF URL',
    'ID', 'Text ID',
    'Section', 'Section Code',
    'Paragraph', 'Paragraph Code',
    'Filename', 'Text Paragraph',
]

In [36]:
# Put the columns named in `ordered_columns` first, then every remaining
# column in its current relative order.
df_qjpp_dimensions = df_qjpp_dimensions.loc[
    :,
    [
        *ordered_columns,
        *(name for name in df_qjpp_dimensions.columns if name not in ordered_columns),
    ],
]

In [37]:
# Display the column labels of the QJPP frame in their current order.
df_qjpp_dimensions.columns

Index(['Source', 'Discipline', 'Journal', 'Vol/Issue', 'Published', 'Title',
       'Authors', 'URL', 'DOI', 'PDF URL', 'ID', 'Text ID', 'Section',
       'Section Code', 'Paragraph', 'Paragraph Code', 'Filename',
       'Text Paragraph', 'Type/Token', 'Word Length', 'Word Count',
       'Factor 1 Score', 'Factor 2 Score', 'Factor 3 Score', 'Factor 4 Score',
       'Factor 5 Score'],
      dtype='object')

## Concatenate the DataFrames

In [38]:
# Stack the three corpora row-wise, in the order EL2AP, AI-EL2AP, QJPP.
# ignore_index=True discards the per-corpus indexes and numbers the rows
# 0..n-1 across the combined frame.
df_cl_st1_eyamrog_dimensions = pd.concat(
    objs=(df_el2ap_dimensions, df_ai_el2ap_dimensions, df_qjpp_dimensions),
    axis=0,
    ignore_index=True,
)

### Correcting the `Linguistic, literature and arts` discipline

In [39]:
# Replace the misspelt discipline label (exact, whole-value match) with the
# correct one; all other values are left untouched.
df_cl_st1_eyamrog_dimensions['Discipline'] = df_cl_st1_eyamrog_dimensions['Discipline'].replace(
    {'Linguistic, literature and arts': 'Linguistics, literature and arts'}
)

### Checking the DataFrame for missing values

In [40]:
# Display the number of missing (NaN/NaT/None) values in each column.
df_cl_st1_eyamrog_dimensions.isna().sum()

Source            0
Discipline        0
Journal           0
Vol/Issue         0
Published         0
Title             0
Authors           0
URL               0
DOI               0
PDF URL           0
ID                0
Text ID           0
Section           0
Section Code      0
Paragraph         0
Paragraph Code    0
Filename          0
Text Paragraph    0
Type/Token        0
Word Length       0
Word Count        0
Factor 1 Score    0
Factor 2 Score    0
Factor 3 Score    0
Factor 4 Score    0
Factor 5 Score    0
dtype: int64

In [41]:
# Display the dtype of each column of the concatenated frame.
df_cl_st1_eyamrog_dimensions.dtypes

Source                    object
Discipline                object
Journal                   object
Vol/Issue                 object
Published         datetime64[ns]
Title                     object
Authors                   object
URL                       object
DOI                       object
PDF URL                   object
ID                        object
Text ID                   object
Section                   object
Section Code              object
Paragraph                 object
Paragraph Code            object
Filename                  object
Text Paragraph            object
Type/Token               float64
Word Length              float64
Word Count                 int64
Factor 1 Score           float64
Factor 2 Score           float64
Factor 3 Score           float64
Factor 4 Score           float64
Factor 5 Score           float64
dtype: object

In [42]:
# Display the concatenated frame (the notebook renders a truncated preview).
df_cl_st1_eyamrog_dimensions

Unnamed: 0,Source,Discipline,Journal,Vol/Issue,Published,Title,Authors,URL,DOI,PDF URL,...,Filename,Text Paragraph,Type/Token,Word Length,Word Count,Factor 1 Score,Factor 2 Score,Factor 3 Score,Factor 4 Score,Factor 5 Score
0,EL2AP,Biological Sciences,SciELO Preprints,Not defined,2022-11-22,"(Fern flora of Viçosa, Minas Gerais State, Bra...","Nelson Túlio Lage Pena, Pedro Bond Schwartsburd",https://preprints.scielo.org/index.php/scielo/...,Not defined,https://preprints.scielo.org/index.php/scielo/...,...,t000000_s1_p1_el2ap.txt,"(Fern flora of Viçosa, Minas Gerais State, Bra...",26.3,5.8,162,-23.71,-4.13,3.95,-6.61,-3.63
1,EL2AP,Biological Sciences,SciELO Preprints,Not defined,2022-11-22,"(Fern flora of Viçosa, Minas Gerais State, Bra...","Nelson Túlio Lage Pena, Pedro Bond Schwartsburd",https://preprints.scielo.org/index.php/scielo/...,Not defined,https://preprints.scielo.org/index.php/scielo/...,...,t000000_s8_p1_el2ap.txt,At the end of the era in which the plant Class...,20.5,5.4,109,-23.11,-4.52,7.56,-2.94,8.99
2,EL2AP,Biological Sciences,SciELO Preprints,Not defined,2022-11-22,"(Fern flora of Viçosa, Minas Gerais State, Bra...","Nelson Túlio Lage Pena, Pedro Bond Schwartsburd",https://preprints.scielo.org/index.php/scielo/...,Not defined,https://preprints.scielo.org/index.php/scielo/...,...,t000000_s8_p2_el2ap.txt,Brazil is represented by Dennstaedtiaceae with...,22.0,5.2,118,-16.08,-5.73,7.47,-6.61,8.36
3,EL2AP,Biological Sciences,SciELO Preprints,Not defined,2022-11-22,"(Fern flora of Viçosa, Minas Gerais State, Bra...","Nelson Túlio Lage Pena, Pedro Bond Schwartsburd",https://preprints.scielo.org/index.php/scielo/...,Not defined,https://preprints.scielo.org/index.php/scielo/...,...,t000000_s8_p3_el2ap.txt,Pteridium is spread all across the globe (exce...,18.3,4.8,97,-2.16,-3.41,-2.12,-1.92,-0.50
4,EL2AP,Biological Sciences,SciELO Preprints,Not defined,2022-11-22,"(Fern flora of Viçosa, Minas Gerais State, Bra...","Nelson Túlio Lage Pena, Pedro Bond Schwartsburd",https://preprints.scielo.org/index.php/scielo/...,Not defined,https://preprints.scielo.org/index.php/scielo/...,...,t000000_s8_p4_el2ap.txt,"In the State of Minas Gerais, Dennstaedtiaceae...",16.8,5.2,94,-25.59,-5.38,-0.54,-6.61,4.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22779,QJPP,"Linguistics, literature and arts",Corpora,"Volume 17, Issue Supplement",2022-09-27,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,...,t000299_s13_p32_qjpp.txt,"However, compared with the writers of New Zeal...",11.3,6.1,66,-30.89,-4.73,-8.50,-6.61,15.62
22780,QJPP,"Linguistics, literature and arts",Corpora,"Volume 17, Issue Supplement",2022-09-27,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,...,t000299_s13_p33_qjpp.txt,Prommas’s (2020 ) finding of the lower use of ...,28.0,4.9,231,-22.72,-4.06,7.42,-4.87,1.13
22781,QJPP,"Linguistics, literature and arts",Corpora,"Volume 17, Issue Supplement",2022-09-27,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,...,t000299_s13_p34_qjpp.txt,The findings of this mixed-methods study were ...,17.0,5.7,106,-30.26,-2.01,4.62,-6.61,-0.58
22782,QJPP,"Linguistics, literature and arts",Corpora,"Volume 17, Issue Supplement",2022-09-27,Learner corpus research in New Zealand,"Anna Siyanova-Chanturia, Jean Parkinson, and T...",https://www.euppublishing.com/doi/full/10.3366...,https://doi.org/10.3366/cor.2022.0250,https://www.euppublishing.com/doi/pdf/10.3366/...,...,t000299_s4_p1_qjpp.txt,"In this paper, we attempted to highlight the s...",27.5,5.2,215,-22.67,-4.20,3.79,-6.61,5.89


## Check the presence of zero-score texts in the dimensions

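Zero-score texts are rare: in the run recorded below, at most 11 of the 22,783 rows have a score of exactly zero on any single factor, so their presence seems to be insignificant.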

In [43]:
# Flag the rows whose 'Factor 1 Score' is exactly zero, then report how many
# there are and at which index labels they occur.
zero_mask = df_cl_st1_eyamrog_dimensions['Factor 1 Score'].eq(0)
zero_indexes = zero_mask[zero_mask].index.tolist()
zero_count = zero_mask.sum()

print(f"Rows where 'Factor 1 Score' is zero: {zero_count}")
print(f"Indexes of those rows: {zero_indexes}")

Rows where 'Factor 1 Score' is zero: 0
Indexes of those rows: []


In [44]:
# Flag the rows whose 'Factor 2 Score' is exactly zero, then report how many
# there are and at which index labels they occur.
zero_mask = df_cl_st1_eyamrog_dimensions['Factor 2 Score'].eq(0)
zero_indexes = zero_mask[zero_mask].index.tolist()
zero_count = zero_mask.sum()

print(f"Rows where 'Factor 2 Score' is zero: {zero_count}")
print(f"Indexes of those rows: {zero_indexes}")

Rows where 'Factor 2 Score' is zero: 3
Indexes of those rows: [2626, 9663, 14905]


In [45]:
# Flag the rows whose 'Factor 3 Score' is exactly zero, then report how many
# there are and at which index labels they occur.
zero_mask = df_cl_st1_eyamrog_dimensions['Factor 3 Score'].eq(0)
zero_indexes = zero_mask[zero_mask].index.tolist()
zero_count = zero_mask.sum()

print(f"Rows where 'Factor 3 Score' is zero: {zero_count}")
print(f"Indexes of those rows: {zero_indexes}")

Rows where 'Factor 3 Score' is zero: 11
Indexes of those rows: [248, 262, 4004, 4633, 5208, 8082, 18772, 18932, 19392, 22183, 22343]


In [46]:
# Flag the rows whose 'Factor 4 Score' is exactly zero, then report how many
# there are and at which index labels they occur.
zero_mask = df_cl_st1_eyamrog_dimensions['Factor 4 Score'].eq(0)
zero_indexes = zero_mask[zero_mask].index.tolist()
zero_count = zero_mask.sum()

print(f"Rows where 'Factor 4 Score' is zero: {zero_count}")
print(f"Indexes of those rows: {zero_indexes}")

Rows where 'Factor 4 Score' is zero: 1
Indexes of those rows: [1399]


In [47]:
# Flag the rows whose 'Factor 5 Score' is exactly zero, then report how many
# there are and at which index labels they occur.
zero_mask = df_cl_st1_eyamrog_dimensions['Factor 5 Score'].eq(0)
zero_indexes = zero_mask[zero_mask].index.tolist()
zero_count = zero_mask.sum()

print(f"Rows where 'Factor 5 Score' is zero: {zero_count}")
print(f"Indexes of those rows: {zero_indexes}")

Rows where 'Factor 5 Score' is zero: 6
Indexes of those rows: [5250, 6053, 10475, 14952, 19254, 22162]


### Exporting to files

In [48]:
# Export as JSON Lines: one record (row) per line.
df_cl_st1_eyamrog_dimensions.to_json(
    path_or_buf=f"{output_directory}/df_cl_st1_eyamrog_dimensions.jsonl",
    orient='records',
    lines=True,
)

In [49]:
# Export as a tab-separated file: UTF-8, '\n' line endings, no index column.
df_cl_st1_eyamrog_dimensions.to_csv(
    path_or_buf=f"{output_directory}/df_cl_st1_eyamrog_dimensions.tsv",
    sep='\t',
    index=False,
    encoding='utf-8',
    lineterminator='\n',
)

In [50]:
# Export as an Excel workbook. index=False keeps the meaningless 0..n-1 row
# index out of the sheet, so the workbook has the same columns as the TSV
# export (which also uses index=False) and the JSONL records.
df_cl_st1_eyamrog_dimensions.to_excel(
    f"{output_directory}/df_cl_st1_eyamrog_dimensions.xlsx",
    index=False,
)