In [115]:
import re
import os
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [116]:
def extract(content):
    """
    Extracts the contents between <TITLE>...</TITLE> and <TEXT>...</TEXT>
    tags and returns the concatenated data.
    """
    content = re.sub('\n', ' ', content)

    title = re.search('<TITLE>(.*?)</TITLE>', content).group(1)

    text = re.search('<TEXT>(.*?)</TEXT>', content).group(1)

    data = title + " " + text

    return data

In [117]:
def read_file(directory = "CSE508_Winter2023_Dataset"):
    data = {}
    for filename in os.listdir(directory):    
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as f:
            file_contents = f.read()
        data[filename] = file_contents
    return data

In [118]:
data = read_file()
print("File count = ",len(data))

File count =  1400


In [119]:
def extract_data(data):
    extracted_data = {}
    for i in data.keys():
        extracted_data[i] = extract(data[i])
    return extracted_data

In [120]:
data = extract_data(data)

In [121]:
print(len(data))

1400


In [122]:
def save_data(data, directory_path):
    """
    Saves data from a dictionary to files, where the keys are the file names and the values are the contents of the files.
    """
    os.makedirs(directory_path, exist_ok=True)

    for filename, file_contents in data.items():
        file_path = os.path.join(directory_path, filename)
        
        with open(file_path, 'w') as f:
            f.write(file_contents)

In [123]:
directory_path = "CSE508_Winter2023_Dataset_Processed"
save_data(data,directory_path)

In [124]:
df = pd.DataFrame(list(data.items()),columns = ['file_name','raw_data'])

In [125]:
df['TEXT_LOWER']=df['raw_data'].str.lower()

In [126]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER
0,cranfield0001,experimental investigation of the aerodynamic...,experimental investigation of the aerodynamic...
1,cranfield0002,simple shear flow past a flat plate in an inc...,simple shear flow past a flat plate in an inc...
2,cranfield0003,the boundary layer in simple shear flow past ...,the boundary layer in simple shear flow past ...
3,cranfield0004,approximate solutions of the incompressible l...,approximate solutions of the incompressible l...
4,cranfield0005,one-dimensional transient heat conduction int...,one-dimensional transient heat conduction int...


In [127]:
df["tokenization"] = df["TEXT_LOWER"].apply(nltk.word_tokenize)

In [128]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization
0,cranfield0001,experimental investigation of the aerodynamic...,experimental investigation of the aerodynamic...,"[experimental, investigation, of, the, aerodyn..."
1,cranfield0002,simple shear flow past a flat plate in an inc...,simple shear flow past a flat plate in an inc...,"[simple, shear, flow, past, a, flat, plate, in..."
2,cranfield0003,the boundary layer in simple shear flow past ...,the boundary layer in simple shear flow past ...,"[the, boundary, layer, in, simple, shear, flow..."
3,cranfield0004,approximate solutions of the incompressible l...,approximate solutions of the incompressible l...,"[approximate, solutions, of, the, incompressib..."
4,cranfield0005,one-dimensional transient heat conduction int...,one-dimensional transient heat conduction int...,"[one-dimensional, transient, heat, conduction,..."


In [129]:
def remove_stop_words(tokens):
    stop_words = set(stopwords.words('english'))

    tokens_clean = [token for token in tokens if token not in stop_words]
    return tokens_clean

In [130]:
df["remove_stop_word"] = df["tokenization"].apply(remove_stop_words)

In [131]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization,remove_stop_word
0,cranfield0001,experimental investigation of the aerodynamic...,experimental investigation of the aerodynamic...,"[experimental, investigation, of, the, aerodyn...","[experimental, investigation, aerodynamics, wi..."
1,cranfield0002,simple shear flow past a flat plate in an inc...,simple shear flow past a flat plate in an inc...,"[simple, shear, flow, past, a, flat, plate, in...","[simple, shear, flow, past, flat, plate, incom..."
2,cranfield0003,the boundary layer in simple shear flow past ...,the boundary layer in simple shear flow past ...,"[the, boundary, layer, in, simple, shear, flow...","[boundary, layer, simple, shear, flow, past, f..."
3,cranfield0004,approximate solutions of the incompressible l...,approximate solutions of the incompressible l...,"[approximate, solutions, of, the, incompressib...","[approximate, solutions, incompressible, lamin..."
4,cranfield0005,one-dimensional transient heat conduction int...,one-dimensional transient heat conduction int...,"[one-dimensional, transient, heat, conduction,...","[one-dimensional, transient, heat, conduction,..."


In [132]:
def remove_punctuation(tokens):
    punctuation = string.punctuation
    lst = []
    for i in range(len(tokens)):
        temp = ""
        for j in tokens[i]:
            if j not in punctuation:
                temp+=j
        tokens[i] = temp
    print(tokens)
    # token_clean = [token for token in tokens if token not in punctuation]
    #     # temp = ""
    # print(token_clean)
        # for t in token_clean:
        #     temp+=t
        # print(temp)
        # lst.append(temp)
        
    return tokens

In [133]:
df["remove_punctuation"] = df["remove_stop_word"].apply(remove_punctuation)

['experimental', 'investigation', 'aerodynamics', 'wing', 'slipstream', '', 'experimental', 'study', 'wing', 'propeller', 'slipstream', 'made', 'order', 'determine', 'spanwise', 'distribution', 'lift', 'increase', 'due', 'slipstream', 'different', 'angles', 'attack', 'wing', 'different', 'free', 'stream', 'slipstream', 'velocity', 'ratios', '', 'results', 'intended', 'part', 'evaluation', 'basis', 'different', 'theoretical', 'treatments', 'problem', '', 'comparative', 'span', 'loading', 'curves', '', 'together', 'supporting', 'evidence', '', 'showed', 'substantial', 'part', 'lift', 'increment', 'produced', 'slipstream', 'due', 'destalling', 'boundarylayercontrol', 'effect', '', 'integrated', 'remaining', 'lift', 'increment', '', 'subtracting', 'destalling', 'lift', '', 'found', 'agree', 'well', 'potential', 'flow', 'theory', '', 'empirical', 'evaluation', 'destalling', 'effects', 'made', 'specific', 'configuration', 'experiment', '']
['simple', 'shear', 'flow', 'past', 'flat', 'plate',

In [134]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization,remove_stop_word,remove_punctuation
0,cranfield0001,experimental investigation of the aerodynamic...,experimental investigation of the aerodynamic...,"[experimental, investigation, of, the, aerodyn...","[experimental, investigation, aerodynamics, wi...","[experimental, investigation, aerodynamics, wi..."
1,cranfield0002,simple shear flow past a flat plate in an inc...,simple shear flow past a flat plate in an inc...,"[simple, shear, flow, past, a, flat, plate, in...","[simple, shear, flow, past, flat, plate, incom...","[simple, shear, flow, past, flat, plate, incom..."
2,cranfield0003,the boundary layer in simple shear flow past ...,the boundary layer in simple shear flow past ...,"[the, boundary, layer, in, simple, shear, flow...","[boundary, layer, simple, shear, flow, past, f...","[boundary, layer, simple, shear, flow, past, f..."
3,cranfield0004,approximate solutions of the incompressible l...,approximate solutions of the incompressible l...,"[approximate, solutions, of, the, incompressib...","[approximate, solutions, incompressible, lamin...","[approximate, solutions, incompressible, lamin..."
4,cranfield0005,one-dimensional transient heat conduction int...,one-dimensional transient heat conduction int...,"[one-dimensional, transient, heat, conduction,...","[onedimensional, transient, heat, conduction, ...","[onedimensional, transient, heat, conduction, ..."


In [135]:
def remove_space(tokens):
    token_clean = [token for token in tokens if token.strip()]
    return token_clean

In [136]:
df["remove_space"] = df["remove_punctuation"].apply(remove_space)

In [137]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization,remove_stop_word,remove_punctuation,remove_space
0,cranfield0001,experimental investigation of the aerodynamic...,experimental investigation of the aerodynamic...,"[experimental, investigation, of, the, aerodyn...","[experimental, investigation, aerodynamics, wi...","[experimental, investigation, aerodynamics, wi...","[experimental, investigation, aerodynamics, wi..."
1,cranfield0002,simple shear flow past a flat plate in an inc...,simple shear flow past a flat plate in an inc...,"[simple, shear, flow, past, a, flat, plate, in...","[simple, shear, flow, past, flat, plate, incom...","[simple, shear, flow, past, flat, plate, incom...","[simple, shear, flow, past, flat, plate, incom..."
2,cranfield0003,the boundary layer in simple shear flow past ...,the boundary layer in simple shear flow past ...,"[the, boundary, layer, in, simple, shear, flow...","[boundary, layer, simple, shear, flow, past, f...","[boundary, layer, simple, shear, flow, past, f...","[boundary, layer, simple, shear, flow, past, f..."
3,cranfield0004,approximate solutions of the incompressible l...,approximate solutions of the incompressible l...,"[approximate, solutions, of, the, incompressib...","[approximate, solutions, incompressible, lamin...","[approximate, solutions, incompressible, lamin...","[approximate, solutions, incompressible, lamin..."
4,cranfield0005,one-dimensional transient heat conduction int...,one-dimensional transient heat conduction int...,"[one-dimensional, transient, heat, conduction,...","[onedimensional, transient, heat, conduction, ...","[onedimensional, transient, heat, conduction, ...","[onedimensional, transient, heat, conduction, ..."


In [138]:
data_new = {}
count = 0
for i in df["remove_space"]:
    # df['id'].loc[data.index[0]]
    temp = ""
    count+=1
    for j in i:
        temp+=j+" "
    data_new[df['file_name'].loc[df.index[count-1]]] = temp
    


In [139]:
directory_path = "CSE508_Winter2023_Dataset_Processed_new"
save_data(data_new,directory_path)

In [81]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [83]:
remove_punctuation("'s")

['s']


['s']