In [None]:
import pandas as pd

In [None]:
# Excel workbook with the investment data. An alternative input,
# "../data/data_investeringer.xlsx", is kept here for reference only.
data_path = "../data/investeringer_datagrundlag.xlsx"

# Read the workbook into the working DataFrame used by the cells below.
df = pd.read_excel(io=data_path)

In [None]:
# Collect the rows that have no "Kommune" value and show them for inspection.
missing_kommune_rows = df.loc[df['Kommune'].isna()]
print(missing_kommune_rows)

# Continue with only the rows where "Kommune" is present.
df = df.loc[df['Kommune'].notna()]

In [None]:
import numpy as np

# Normalise the market-value column to floats in a single chain:
#   1. a cell that is exactly '-' means "no value" and becomes NaN
#      (e.g. there are two such rows from Odense),
#   2. commas and spaces are stripped from string cells,
#   3. the cleaned values are cast to float.
df['Markedsværdi (DKK)'] = (
    df['Markedsværdi (DKK)']
    .replace('-', np.nan)
    .replace({',': '', ' ': ''}, regex=True)
    .astype(float)
)


In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

In [None]:
import pandas as pd

# Fill missing 'Type' values with the dominant type of each 'ISIN kode' group
def fill_missing_type(df, min_rows=5, agree_threshold=0.8):
    """Fill missing 'Type' values from the dominant type of the same 'ISIN kode'.

    For every 'ISIN kode' group, the most frequent non-missing 'Type' is used
    to fill the group's missing 'Type' cells, but only when the group has at
    least ``min_rows`` rows and the most frequent type accounts for at least
    ``agree_threshold`` of *all* rows in the group (missing ones included).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'ISIN kode' and 'Type'. It is not modified.
    min_rows : int, default 5
        Minimum number of rows a group needs before it is filled.
    agree_threshold : float, default 0.8
        Minimum share of the group's rows that must carry the most common type.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same rows, row order, index and columns,
        where qualifying missing 'Type' cells have been filled. Rows whose
        'ISIN kode' is missing are kept and left untouched.
    """
    # Decide, per ISIN, which type (if any) may be used as the fill value.
    fill_by_isin = {}
    for isin, type_values in df.groupby('ISIN kode')['Type']:
        # value_counts() ignores missing values, so an empty result means the
        # group has no known type to propagate.
        type_counts = type_values.value_counts()
        if type_counts.empty:
            continue

        total_rows = len(type_values)  # includes rows with a missing 'Type'
        if total_rows >= min_rows and type_counts.max() / total_rows >= agree_threshold:
            fill_by_isin[isin] = type_counts.idxmax()

    # Apply the fill on a copy. Mapping the ISIN column keeps every row in
    # place, unlike groupby(...).apply(...), which drops rows with a missing
    # key and can reshape the index.
    filled = df.copy()
    candidates = filled['ISIN kode'].map(fill_by_isin)
    filled['Type'] = filled['Type'].fillna(candidates)

    return filled

# Report how many 'Type' cells are empty before the fill.
missing_before = df['Type'].isnull().sum()
print(f"Missing 'Type' values before: {missing_before}")

# Run the ISIN-based fill: groups need at least 5 rows and 80% agreement.
# NOTE(review): the cells that follow in this view keep reading `df`, so the
# filled values live only in `filled_df` — confirm that is intended.
filled_df = fill_missing_type(df, agree_threshold=0.80, min_rows=5)

# Report how many 'Type' cells are still empty afterwards.
missing_after = filled_df['Type'].isnull().sum()
print(f"Missing 'Type' values after: {missing_after}")

filled_df.head(n=5)


In [None]:
# Rows whose market value lies in the half-open range [0, 100).
df_lav = df[(df['Markedsværdi (DKK)'] >= 0) & (df['Markedsværdi (DKK)'] < 100)]
df_lav

In [None]:
# Total market value of the low-value rows. Series.sum() is the vectorised
# reduction; the built-in sum() would iterate the Series element by element.
df_lav['Markedsværdi (DKK)'].sum()

In [None]:
# Number of rows per value of 'Kommune'.
df.value_counts(subset='Kommune')

In [None]:
# Plain-text rendering (without the index) of all Guldborgsund rows.
text = df.loc[df['Kommune'].eq('Guldborgsund')].to_string(index=False)

In [None]:
!pip install tiktoken

In [None]:
import tiktoken
import pandas as pd

# Render a pandas object as plain text so its tokens can be counted
def dataframe_to_text(df):
    """
    Return the string rendering of ``df`` with the index column left out.

    The object only needs a ``to_string(index=False)`` method. Swap in a
    different serialisation here if another presentation is wanted.
    """
    return df.to_string(index=False)

# Token counting with OpenAI's tokenizer (tiktoken)
def count_tokens(text, model="gpt-4"):
    """
    Return how many tokens ``text`` encodes to under the tokenizer that
    tiktoken associates with ``model`` (e.g. "gpt-3.5-turbo", "gpt-4").
    """
    # Look up the model's encoding, encode the text and count the token ids.
    return len(tiktoken.encoding_for_model(model).encode(text))

# Text rendering of the market values belonging to Guldborgsund.
df_text = dataframe_to_text(
    df.loc[df['Kommune'] == 'Guldborgsund', 'Markedsværdi (DKK)']
)

# Token count of that text under the gpt-4 tokenizer.
num_tokens = count_tokens(df_text, model="gpt-4")

# Uncomment to inspect the rendered text itself:
# print(f"DataFrame text:\n{df_text}")
print(f"\nNumber of tokens: {num_tokens}")