In [2]:
import pandas as pd

# Load the complete text dump into memory as a single UTF-8 string.
with open('Sundheddk_all_text.txt', mode='r', encoding='utf-8') as text_dump:
    data = text_dump.read()

# Split the raw dump into articles; articles are separated by an empty line.
articles = data.split('\n\n')


def _split_summary_and_text(lines):
    """Split one article's lines into its summary and its body text.

    The summary is every line after the first line that starts with
    'Fakta' and before the next line that starts with 'Hvad er'. The body
    text is every line after that 'Hvad er' line (the 'Hvad er' line itself
    is not part of either piece).

    Scanning stops at the first complete 'Fakta' ... 'Hvad er' pair, so a
    later line that happens to start with 'Fakta' cannot move the summary
    start past its end and produce an empty summary.

    Args:
        lines: The article as a list of lines, first line being the header.

    Returns:
        A ``(summary, text)`` tuple of strings, or ``None`` when the article
        has no complete 'Fakta' ... 'Hvad er' pair.
    """
    summary_start = None
    for i, line in enumerate(lines):
        if summary_start is None:
            if line.startswith('Fakta'):
                summary_start = i + 1
        elif line.startswith('Hvad er'):
            # Summary lines are joined with a blank line between them,
            # body lines with a single newline.
            summary = '\n\n'.join(lines[summary_start:i])
            # Slicing past the end yields [], so no explicit bounds check.
            text = '\n'.join(lines[i + 1:])
            return summary, text
    return None


# Parallel lists, one entry per article that has a summary.
source_list = []
header_list = []
text_list = []
summary_list = []

# Extract the required fields from every article.
for article in articles:
    lines = article.split('\n')
    parsed = _split_summary_and_text(lines)
    if parsed is None:
        # Articles without a complete summary section are left out.
        continue
    summary, text = parsed

    source_list.append("SundhedDK")
    header_list.append(lines[0].strip())  # first line of the article
    text_list.append(text)
    summary_list.append(summary)

# Assemble the four parallel lists into one DataFrame (one row per article).
df = pd.DataFrame(dict(
    Source=source_list,
    Header=header_list,
    Text=text_list,
    Summary=summary_list,
))


In [47]:
# Print the extracted summaries (pandas truncates the listing).
print(df.loc[:, 'Summary'])

0       Hæmofili og von Willebrands sygdom er arvelige...
1       Atelektase betyder lufttomt lungevævog opstår,...
2       Overaktiv urinblære (OAB) medfører hyppige til...
3       Sollys indeholder ultraviolette (UV) stråler, ...
4       Åreknuder er udvidelser af de overfladiske blo...
                              ...                        
1923    Bevidsthedssvækkelse er, når en person kan vær...
1924    Reyes syndrom er en meget sjælden, men alvorli...
1925    Tuberkulose i ryggen er en sjælden sygdom i Da...
1926    Hvis man har truende lavt stofskifte, har man ...
1927    Hørenedsættelse betyder nedsat eller manglende...
Name: Summary, Length: 1928, dtype: object


In [32]:
# Persist the extracted articles as CSV, without the row index.
df.to_csv(path_or_buf='more_summs.csv', index=False)

In [48]:
# Persist the extracted articles as an Excel workbook, without the row index.
df.to_excel(excel_writer='more_summs_third.xlsx', index=False)

In [3]:
# Read the two spreadsheets that are going to be merged.
df1 = pd.read_excel('more_summs_Firstmod_Updated.xlsx')
df2 = pd.read_excel('Resumes_without_doubles_Final.xlsx')

# Keep every column of the second sheet except its last one.
df2 = df2.iloc[:, :-1]

# Stack the second sheet on top of the first, then discard every row whose
# 'Summary' cell is missing.
combined_df = pd.concat([df2, df1]).dropna(subset=['Summary'])

# When a 'title' value occurs more than once only its first row is kept;
# df2 comes first in the concatenation, so its rows take precedence.
# The index is renumbered from 0 afterwards.
final_df = (
    combined_df
    .drop_duplicates(subset='title', keep='first')
    .reset_index(drop=True)
)

In [4]:
# Print a structural overview of the merged frame (entries, columns,
# non-null counts, dtypes) to check the result of the merge.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2340 entries, 0 to 2339
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   source   2340 non-null   object
 1   title    2340 non-null   object
 2   text     2337 non-null   object
 3   Summary  2340 non-null   object
dtypes: object(4)
memory usage: 73.2+ KB


In [5]:
# Write the merged, de-duplicated frame to an Excel workbook, without the row index.
final_df.to_excel(excel_writer='more_summs_final.xlsx', index=False)