In [2]:
import re
import os
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def extract(content):
    """
    Extracts the contents between <TITLE>...</TITLE> and <TEXT>...</TEXT>
    tags and returns the concatenated data.
    """
    content = re.sub('\n', '', content)

    title = re.search('<TITLE>(.*?)</TITLE>', content).group(1)

    text = re.search('<TEXT>(.*?)</TEXT>', content).group(1)

    data = title + " " + text

    return data

In [4]:
def read_file(directory = "CSE508_Winter2023_Dataset"):
    data = {}
    for filename in os.listdir(directory):    
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as f:
            file_contents = f.read()
        data[filename] = file_contents
    return data

In [5]:
data = read_file()
print("File count = ",len(data))

File count =  1400


In [6]:
def extract_data(data):
    extracted_data = {}
    for i in data.keys():
        extracted_data[i] = extract(data[i])
    return extracted_data

In [7]:
data = extract_data(data)

In [8]:
print(len(data))

1400


In [9]:
def save_data(data, directory_path):
    """
    Saves data from a dictionary to files, where the keys are the file names and the values are the contents of the files.
    """
    os.makedirs(directory_path, exist_ok=True)

    for filename, file_contents in data.items():
        file_path = os.path.join(directory_path, filename)
        
        with open(file_path, 'w') as f:
            f.write(file_contents)

In [10]:
directory_path = "CSE508_Winter2023_Dataset_Processed"
save_data(data,directory_path)

In [11]:
df = pd.DataFrame(list(data.items()),columns = ['file_name','raw_data'])

In [12]:
df['TEXT_LOWER']=df['raw_data'].str.lower()

In [13]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER
0,cranfield0001,experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...
1,cranfield0002,simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...
2,cranfield0003,the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...
3,cranfield0004,approximate solutions of the incompressible la...,approximate solutions of the incompressible la...
4,cranfield0005,one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...


In [14]:
df["tokenization"] = df["TEXT_LOWER"].apply(nltk.word_tokenize)

In [15]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization
0,cranfield0001,experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...,"[experimental, investigation, of, the, aerodyn..."
1,cranfield0002,simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...,"[simple, shear, flow, past, a, flat, plate, in..."
2,cranfield0003,the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...,"[the, boundary, layer, in, simple, shear, flow..."
3,cranfield0004,approximate solutions of the incompressible la...,approximate solutions of the incompressible la...,"[approximate, solutions, of, the, incompressib..."
4,cranfield0005,one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...,"[one-dimensional, transient, heat, conduction,..."


In [16]:
def remove_stop_words(tokens):
    stop_words = set(stopwords.words('english'))

    tokens_clean = [token for token in tokens if token not in stop_words]
    return tokens_clean

In [17]:
df["remove_stop_word"] = df["tokenization"].apply(remove_stop_words)

In [18]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization,remove_stop_word
0,cranfield0001,experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...,"[experimental, investigation, of, the, aerodyn...","[experimental, investigation, aerodynamics, aw..."
1,cranfield0002,simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...,"[simple, shear, flow, past, a, flat, plate, in...","[simple, shear, flow, past, flat, plate, incom..."
2,cranfield0003,the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...,"[the, boundary, layer, in, simple, shear, flow...","[boundary, layer, simple, shear, flow, past, f..."
3,cranfield0004,approximate solutions of the incompressible la...,approximate solutions of the incompressible la...,"[approximate, solutions, of, the, incompressib...","[approximate, solutions, incompressible, lamin..."
4,cranfield0005,one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...,"[one-dimensional, transient, heat, conduction,...","[one-dimensional, transient, heat, conduction,..."


In [19]:
def remove_punctuation(tokens):
    punctuation = string.punctuation
    token_clean = [token for token in tokens if token not in punctuation]
    return token_clean

In [20]:
df["remove_punctuation"] = df["remove_stop_word"].apply(remove_punctuation)

In [21]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization,remove_stop_word,remove_punctuation
0,cranfield0001,experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...,"[experimental, investigation, of, the, aerodyn...","[experimental, investigation, aerodynamics, aw...","[experimental, investigation, aerodynamics, aw..."
1,cranfield0002,simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...,"[simple, shear, flow, past, a, flat, plate, in...","[simple, shear, flow, past, flat, plate, incom...","[simple, shear, flow, past, flat, plate, incom..."
2,cranfield0003,the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...,"[the, boundary, layer, in, simple, shear, flow...","[boundary, layer, simple, shear, flow, past, f...","[boundary, layer, simple, shear, flow, past, f..."
3,cranfield0004,approximate solutions of the incompressible la...,approximate solutions of the incompressible la...,"[approximate, solutions, of, the, incompressib...","[approximate, solutions, incompressible, lamin...","[approximate, solutions, incompressible, lamin..."
4,cranfield0005,one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...,"[one-dimensional, transient, heat, conduction,...","[one-dimensional, transient, heat, conduction,...","[one-dimensional, transient, heat, conduction,..."


In [22]:
def remove_space(tokens):
    token_clean = [token for token in tokens if token.strip()]
    return token_clean

In [23]:
df["remove_space"] = df["remove_punctuation"].apply(remove_space)

In [24]:
df.head()

Unnamed: 0,file_name,raw_data,TEXT_LOWER,tokenization,remove_stop_word,remove_punctuation,remove_space
0,cranfield0001,experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...,"[experimental, investigation, of, the, aerodyn...","[experimental, investigation, aerodynamics, aw...","[experimental, investigation, aerodynamics, aw...","[experimental, investigation, aerodynamics, aw..."
1,cranfield0002,simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...,"[simple, shear, flow, past, a, flat, plate, in...","[simple, shear, flow, past, flat, plate, incom...","[simple, shear, flow, past, flat, plate, incom...","[simple, shear, flow, past, flat, plate, incom..."
2,cranfield0003,the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...,"[the, boundary, layer, in, simple, shear, flow...","[boundary, layer, simple, shear, flow, past, f...","[boundary, layer, simple, shear, flow, past, f...","[boundary, layer, simple, shear, flow, past, f..."
3,cranfield0004,approximate solutions of the incompressible la...,approximate solutions of the incompressible la...,"[approximate, solutions, of, the, incompressib...","[approximate, solutions, incompressible, lamin...","[approximate, solutions, incompressible, lamin...","[approximate, solutions, incompressible, lamin..."
4,cranfield0005,one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...,"[one-dimensional, transient, heat, conduction,...","[one-dimensional, transient, heat, conduction,...","[one-dimensional, transient, heat, conduction,...","[one-dimensional, transient, heat, conduction,..."


In [25]:
data_new = {}
for i in df["remove_space"]:
    df['id'].loc[data.index[0]]
    temp = ""
    for j in i:
        temp+=j+" "
    print(temp)


experimental investigation aerodynamics awing slipstream experimental study wing propeller slipstream wasmade order determine spanwise distribution liftincrease due slipstream different angles attack wingand different free stream slipstream velocity ratios theresults intended part evaluation basis differenttheoretical treatments problem comparative span loading curves together supportingevidence showed substantial part lift incrementproduced slipstream due /destalling/ boundary-layer-controleffect integrated remaining lift increment subtracting destalling lift found agreewell potential flow theory empirical evaluation destalling effects made forthe specific configuration experiment 
simple shear flow past flat plate incompressible fluid smallviscosity study high-speed viscous flow past two-dimensional body itis usually necessary consider curved shock wave emitting thenose leading edge body consequently exists inviscidrotational flow region shock wave boundary layer situation arises ins