# File operations for comparing BERT and GPT results (initial stages)

In [None]:
import pandas as pd

# Read the CSV files
twitter_df = pd.read_csv("Ingeborg-Bachmann-Preis_Twitter_Decryption.csv", sep=";")
outputs_df = pd.read_csv("twitter_lore2023/Ingeborg-Bachmann-Preis_Twitter_2017.csv")

# Merge the dataframes on the Filename column
merged_df = pd.merge(twitter_df, outputs_df, on="Filename")

# Group by Filename and aggregate values into a dictionary
grouped_df = merged_df.groupby("Filename").agg({
    "Aspect": list,
    "Polarity": list,
    "Category_Coarse": list,
    "Category_Fine": list
}).reset_index()

# Combine the aggregated values into a single JSON column
grouped_df["results"] = grouped_df.apply(lambda x: {
    "Aspect": x["Aspect"],
    "Polarity": x["Polarity"],
    "Category_Coarse": x["Category_Coarse"],
    "Category_Fine": x["Category_Fine"]
}, axis=1)

# Drop the original columns
grouped_df.drop(columns=["Aspect", "Polarity", "Category_Coarse", "Category_Fine"], inplace=True)

# Print the resulting dataframe
print(grouped_df)

In [2]:
# Set the display option to show the full text in columns
pd.set_option("display.max_colwidth", None)

In [1]:
#definitiefwerkt13:26 op 23/5/2023
import pandas as pd
import json

# Read the CSV files with semicolon separator
twitter_df = pd.read_csv("Ingeborg-Bachmann-Preis_Twitter_Decryption.csv", sep=";")
outputs_df = pd.read_csv("twitter_lore2023/Ingeborg-Bachmann-Preis_Twitter_2017.csv")

# Merge the dataframes on the Filename column
merged_df = pd.merge(twitter_df, outputs_df, on="Filename")

# Group by Filename and aggregate values into a dictionary
grouped_df = merged_df.groupby("Filename").agg({
    "Caption": lambda x: x.iloc[0],  # Retrieve the first Caption for each Filename
    "Aspect": list,
    "Polarity": list,
    "Category_Coarse": list,
    "Category_Fine": list,
    "TweetID": lambda x: x.iloc[0]  # Retrieve the first TweetID for each Filename
}).reset_index()

# Read the CSV file
df = pd.read_csv("twitter_lore2023/Ingeborg-Bachmann-Preis_Twitter_2017.csv")

# Group by Filename, Caption, and aggregate values into a dictionary
grouped_df_new = df.groupby(["Filename", "Caption"]).agg({
    "Aspect": list,
    "Polarity": list,
    "Category_Coarse": list,
    "Category_Fine": list
}).reset_index()

# Create a list to store the JSON objects
json_list = []

# Iterate over the rows of grouped_df_new and create JSON objects for matching Filenames
for _, row in grouped_df_new.iterrows():
    filename = row["Filename"]
    caption = row["Caption"]
    if filename in grouped_df["Filename"].values:
        aspects = row["Aspect"]
        polarities = row["Polarity"]
        categories_coarse = row["Category_Coarse"]
        categories_fine = row["Category_Fine"]
        tweet_id = grouped_df.loc[grouped_df["Filename"] == filename, "TweetID"].iloc[0]  # Get the TweetID for the matching Filename

        # Iterate over the lists and create JSON objects
        for aspect, polarity, category_coarse, category_fine in zip(aspects, polarities, categories_coarse, categories_fine):
            json_obj = {
                "aspect category": category_coarse,
                "aspect term": aspect,
                "opinion target": caption,
                "opinion expression": polarity,
                "sentiment": polarity.lower(),
                "confidence score": "-1"
            }
            json_list.append(json_obj)

# Convert the list of JSON objects to JSON string
json_str = json.dumps(json_list)

# Print the JSON string
#print(json_str)

grouped_df["results"] = grouped_df.apply(lambda x: [
    {
        "aspect category": x["Category_Coarse"][i],
        "aspect term": x["Category_Fine"][i],
        "aspect content": x["Aspect"][i],
       "opinion target": x["Caption"],
#        "opinion expression": x["Polarity"][i],
        "sentiment": x["Polarity"][i].lower(),
#        "confidence score": x["Confidence_Score"][i]  # Extract the confidence score from the corresponding column
    }
    for i in range(len(x["Aspect"]))
], axis=1)

# Include the TweetID as a separate column in grouped_df
grouped_df.drop(columns=["Aspect", "Polarity", "Category_Coarse", "Category_Fine"], inplace=True)
# Include the TweetID as a separate column in grouped_df
grouped_df["TweetID"] = grouped_df["Filename"].map(dict(zip(twitter_df["Filename"], twitter_df["TweetID"])))

# Display the first row of the grouped_df dataframe
display(grouped_df.head(1))

Unnamed: 0,Filename,Caption,TweetID,results
0,TDDL_Twitter_2017_1,I liked a @YouTube video http://youtu.be/2AAbk...,8.164146e+17,"[{'aspect category': 'META', 'aspect term': 'M..."


In [3]:
display(grouped_df.head(1))

Unnamed: 0,Filename,Caption,TweetID,results
0,TDDL_Twitter_2017_1,I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016,8.164146e+17,"[{'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'YouTube', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'video', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'http', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'CONTENDER', 'aspect term': 'CONTENDER_General', 'aspect content': 'Stefanie', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'CONTENDER', 'aspect term': 'CONTENDER_General', 'aspect content': ' Sargnagel', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'CONTENDER_General', 'aspect content': 'Videoporträt', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Videoportrait', 'aspect content': ' Bachmannpreis', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}]"


In [5]:
#deze werkt 25/04.2023 9u41
import re
import pandas as pd
import json

# Define a function to extract feedback information from the "analysis" column
def extract_feedback_info(row):
    # try to load the JSON string in the analysis column
    try:
        categories = json.loads(row["analysis"])
    except json.JSONDecodeError as e:
        # if an error occurs, print a warning message and return an empty dictionary
        print(f"JSON decode error in row {row.name}: {e}")
        return {}

    # return the feedback_data dictionary
    return categories

# Read in the feedbacks and outputs CSV files
feedbacks_df = pd.read_csv("feedbacks_analysis_2017_all.csv")
outputs_df = grouped_df

# Define a function to reformat scientific notation to desired format
def reformat_scientific_notation(title):
    if isinstance(title, (str, bytes)):
        match = re.search(r'(\d+(\.\d+)?)[eE]([-+]?\d+)', title)
        if match:
            number = match.group(1)
            exponent = match.group(3)
            return f"{number[:16]}E{exponent.zfill(3)}"
        else:
            return title
    else:
        return title

# Apply the function to the "title" column in the outputs dataframe
outputs_df["TweetID"] = outputs_df["TweetID"].astype(str)
feedbacks_df["title"] = feedbacks_df["title"].astype(str)
#feedbacks_df["title"] = feedbacks_df["title"].apply(reformat_scientific_notation)

# Replace single quotes with double quotes in the analysis column
feedbacks_df["analysis"] = feedbacks_df["analysis"].str.replace("'", '"')

# Replace backslashes with double backslashes in the analysis column
feedbacks_df["analysis"] = feedbacks_df["analysis"].str.replace("\\", "\\\\")

# Remove control characters from the analysis column
feedbacks_df["analysis"] = feedbacks_df["analysis"].str.replace(r'[\x00-\x1F]+', '', regex=True)

# Apply the function to the feedbacks dataframe to create a new column
feedbacks_df["categories"] = feedbacks_df.apply(extract_feedback_info, axis=1)
#outputs_df["TweetID"] = outputs_df["TweetID"].astype(str)
#make sure that the E in title notations is also capitalized by using the lambda on feedbacks_df["title"] = feedbacks_df["title"].apply(lambda x: '{:.15g}'.format(float(x)))
outputs_df["TweetID"] = outputs_df["TweetID"].astype(str).apply(lambda x: x.replace("e", "E") if "e" in x else x)


feedbacks_df["title"] = feedbacks_df["title"].astype(str)
# Convert "title" column to string data type
#feedbacks_df["title"] = feedbacks_df["title"].astype(str)
feedbacks_df["title"] = feedbacks_df["title"].apply(lambda x: '{:.15g}'.format(float(x)).replace("e", "E"))

#I want you to cut down short scientific number in "title" like 8.828872388116152e+17 one number before the "e+" so that it matches TweetID: 8.82887238811615e+17 via feedbacks_df["title"] = feedbacks_df["title"].astype(str)
# Shorten the scientific notation in "title" column
#feedbacks_df["title"] = feedbacks_df["title"].apply(lambda x: '{:.15g}'.format(float(x)))
#The '{:.15g}'.format(float(x)) function formats the number to have 15 significant digits and then shortens the scientific notation to one number before the "e+".
feedbacks_df["title"] = feedbacks_df["title"].apply(lambda x: '{:.15g}'.format(float(x)).replace("e", "E"))
# Merge
# Merge the outputs dataframe with the feedbacks dataframe on the "title" and "TwitterID" columns
merged_df = pd.merge(outputs_df, feedbacks_df[["title", "categories"]], left_on=outputs_df["TweetID"].str[:16], right_on=feedbacks_df["title"].str[:16], how="left")


# Merge
#Merge the outputs dataframe with the feedbacks dataframe on the "title" and "TwitterID" columns
#merged_df = pd.merge(outputs_df, feedbacks_df[["title", "categories"]], left_on="TweetID", right_on="title", how="left")
#merged_df = pd.merge(outputs_df, feedbacks_df[["title", "categories"]], left_on=outputs_df["TweetID"].str[:16], right_on=feedbacks_df["title"].str[:16], how="left")
#merged_df = pd.merge(outputs_df, feedbacks_df[["title", "categories"]], left_on=outputs_df["TweetID"].str[:16], right_on=feedbacks_df["title"].str[:16], how="left")

#Print out all cases where TweetID does not match title
for index, row in merged_df.iterrows():
    if pd.notnull(row["title"]) and row["title"] != row["TweetID"]:
        print(f"TweetID: {row['TweetID']} does not match title: {row['title']}")

#Save the merged dataframe to a new CSV file
#merged_df.to_csv("merged_pranay_gpt_def.csv", index=False)

  feedbacks_df["analysis"] = feedbacks_df["analysis"].str.replace("\\", "\\\\")


JSON decode error in row 156: Expecting ',' delimiter: line 1 column 932 (char 931)
JSON decode error in row 171: Expecting ',' delimiter: line 1 column 92 (char 91)
JSON decode error in row 203: Expecting ',' delimiter: line 1 column 129 (char 128)
JSON decode error in row 297: Expecting ',' delimiter: line 1 column 207 (char 206)
JSON decode error in row 325: Expecting ',' delimiter: line 1 column 295 (char 294)
JSON decode error in row 424: Expecting ',' delimiter: line 1 column 143 (char 142)
JSON decode error in row 426: Expecting ',' delimiter: line 1 column 163 (char 162)
JSON decode error in row 474: Expecting ',' delimiter: line 1 column 111 (char 110)
JSON decode error in row 523: Expecting ',' delimiter: line 1 column 126 (char 125)
JSON decode error in row 538: Expecting ',' delimiter: line 1 column 323 (char 322)
JSON decode error in row 639: Expecting ',' delimiter: line 1 column 129 (char 128)
JSON decode error in row 643: Expecting ',' delimiter: line 1 column 107 (char

In [6]:
merged_df.head()

Unnamed: 0,key_0,Filename,Caption,TweetID,results,title,categories
0,8.16414648199168,TDDL_Twitter_2017_1,I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016,8.16414648199168e+17,"[{'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'YouTube', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'video', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'http', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'CONTENDER', 'aspect term': 'CONTENDER_General', 'aspect content': 'Stefanie', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'CONTENDER', 'aspect term': 'CONTENDER_General', 'aspect content': ' Sargnagel', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'CONTENDER_General', 'aspect content': 'Videoporträt', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Videoportrait', 'aspect content': ' Bachmannpreis', 'opinion target': 'I liked a @YouTube video http://youtu.be/2AAbkbcUjZA?a Stefanie Sargnagel - Videoporträt Bachmannpreis 2016', 'sentiment': 'positive'}]",,
1,8.56571429512499,TDDL_Twitter_2017_100,tddl trtl,8.56571429512499e+17,"[{'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'tddl trtl', 'sentiment': 'negative'}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'trtl', 'opinion target': 'tddl trtl', 'sentiment': 'negative'}]",,
2,8.82882500384752,TDDL_Twitter_2017_1000,"Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17",8.82882500384752e+17,"[{'aspect category': 'JURY', 'aspect term': 'TEXT_General', 'aspect content': 'Feßmann', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'negative'}, {'aspect category': 'TEXT', 'aspect term': 'TEXT_General', 'aspect content': 'Text', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'negative'}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl17', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'negative'}]",8.82882500384752e+17,"[{'aspect category': 'JURY', 'aspect term': 'Behaviour', 'opinion target': 'Meike Feßmann', 'opinion expression': 'redet sich den Text schön', 'sentiment': 'positive', 'confidence score': '1'}, {'aspect category': 'JURY', 'aspect term': 'Valuation', 'opinion target': 'die Annahme', 'opinion expression': 'falsch', 'sentiment': 'negative', 'confidence score': '-1'}]"
3,8.82882497335505,TDDL_Twitter_2017_1001,Kindle Hybrid! #keller #peschka #tddl,8.82882497335505e+17,"[{'aspect category': 'TEXT', 'aspect term': 'JURY_Discussion_Valuation', 'aspect content': 'Hybrid', 'opinion target': 'Kindle Hybrid! #keller #peschka #tddl', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'peschka', 'opinion target': 'Kindle Hybrid! #keller #peschka #tddl', 'sentiment': 'positive'}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'Kindle Hybrid! #keller #peschka #tddl', 'sentiment': 'negative'}]",8.82882497335505e+17,"[{'aspect category': 'JURY', 'aspect term': 'Name', 'opinion target': 'Keller', 'opinion expression': '', 'sentiment': 'neutral', 'confidence score': '0'}, {'aspect category': 'ALLO-REFERENCE', 'aspect term': 'Other-Author', 'opinion target': 'Peschka', 'opinion expression': '', 'sentiment': 'neutral', 'confidence score': '0'}]"
4,8.82882518852276,TDDL_Twitter_2017_1002,Google-Bildersuche nach »Kindl« #tddl pic.twitter.com/FxQhSVMAjs,8.82882518852276e+17,"[{'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'Kindl', 'opinion target': 'Google-Bildersuche nach »Kindl« #tddl pic.twitter.com/FxQhSVMAjs', 'sentiment': 'negative'}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'Google-Bildersuche nach »Kindl« #tddl pic.twitter.com/FxQhSVMAjs', 'sentiment': 'positive'}]",8.82882518852276e+17,"[{'aspect category': 'ALLO-REFERENCE', 'aspect term': 'SCREEN_Film_Tv', 'opinion target': 'Google-Bildersuche nach »Kindl«', 'opinion expression': 'pic.twitter.com/FxQhSVMAjs', 'sentiment': 'neutral', 'confidence score': '0'}]"


In [7]:
def merge_categories(row):
    results = row["results"]
    categories = row["categories"]

    if isinstance(results, list) and all(isinstance(res, dict) for res in results) and \
       isinstance(categories, list) and all(isinstance(cat, dict) for cat in categories):

        merged_categories = []
        overlapping_labels = 0
        non_overlapping_labels_results = len(results)
        non_overlapping_labels_categories = len(categories)

        for category in categories:
            merged_category = category.get("aspect category", "") + "_" + category.get("aspect term", "")
            merged_categories.append(merged_category)

            if any(result.get("aspect category", "") + "_" + result.get("aspect term", "") == merged_category for result in results):
                overlapping_labels += 1
            else:
                non_overlapping_labels_categories += 1

        for result in results:
            if not any(result.get("aspect category", "") + "_" + result.get("aspect term", "") == merged_category for merged_category in merged_categories):
                non_overlapping_labels_results += 1

            aspect_term_first_word = result.get('aspect term', '').split('_')[0]
            result['aspect_term_first_word'] = aspect_term_first_word
            result['matches_aspect_category'] = aspect_term_first_word.upper() == result.get('aspect category', '')

        return pd.Series({
            "categories_merged": merged_categories,
            "overlapping labels": overlapping_labels,
            "non-overlapping labels in results": non_overlapping_labels_results,
            "non-overlapping labels in categories": non_overlapping_labels_categories,
            "results": results
        })

    else:
        return pd.Series({
            "categories_merged": [],
            "overlapping labels": 0,
            "non-overlapping labels in results": 0,
            "non-overlapping labels in categories": len(categories) if isinstance(categories, list) else 0,
            "results": results
        })

def generate_matching_sequence(row):
    results = row["results"]

    if isinstance(results, list) and all(isinstance(res, dict) for res in results):
        matching_sequences = []
        for result in results:
            aspect_term_first_word = result.get('aspect_term_first_word', '')
            aspect_category = result.get('aspect category', '')
            matches_aspect_category = result.get('matches_aspect_category', '')
            matching_sequence = f"{aspect_category}, {aspect_term_first_word}, {matches_aspect_category}"
            matching_sequences.append(matching_sequence)

        return matching_sequences

    return []

def compute_matching_percentage(row):
    results = row["results"]
    matching = row["bert_matching_cats_terms"]

    if isinstance(results, list) and all(isinstance(res, dict) for res in results):
        total_matches = sum(1 for match in matching if 'True' in match)
        total_results = len(results)
        
        if total_results > 0:
            return total_matches / total_results * 100

    return 0

def add_hashtag(row):
    caption = row["Caption"]
    results = row["results"]
    updated_results = []
    
    if isinstance(results, list) and all(isinstance(res, dict) for res in results):
        for result in results:
            updated_result = result.copy()  
            aspect_content = result.get('aspect content', '')
            if aspect_content and ('#' + aspect_content) in caption:
                updated_result["hashtag"] = True
            else:
                updated_result["hashtag"] = False
            updated_results.append(updated_result)
            
    return updated_results

def filter_no_hashtags(row):
    results = row["results"]
    results_no_hashtags = []

    if isinstance(results, list) and all(isinstance(res, dict) for res in results):
        for result in results:
            if not result.get("hashtag", False):
                results_no_hashtags.append(result)
            
    return results_no_hashtags

# Filter the DataFrame to include only rows where categories is not NaN
filtered_df = merged_df[merged_df["categories"].notna()]

# Apply the merge_categories function to create new columns on the filtered DataFrame
filtered_df[["categories_merged", "overlapping labels", "non-overlapping labels in results", "non-overlapping labels in categories", "results"]] = filtered_df.apply(merge_categories, axis=1)

# Generate matching sequence
filtered_df['bert_matching_cats_terms'] = filtered_df.apply(generate_matching_sequence, axis=1)

# Compute matching percentage
filtered_df['bert_matching_cats_terms_percentage'] = filtered_df.apply(compute_matching_percentage, axis=1)

# Add hashtag field
filtered_df['results'] = filtered_df.apply(add_hashtag, axis=1)

# Compare labels and add an "equal_labels" column on the filtered DataFrame
filtered_df["equal_labels"] = filtered_df["categories_merged"].apply(lambda x: len(x)) == filtered_df["categories"].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Apply the filter_no_hashtags function to create the new column
filtered_df['results_no_hashtags'] = filtered_df.apply(filter_no_hashtags, axis=1)

# Get a list of the column names
cols = list(filtered_df.columns)

# Move the new column to right after the "results" column
cols.insert(cols.index('results') + 1, cols.pop(cols.index('results_no_hashtags')))

# Reorder the DataFrame
filtered_df = filtered_df[cols]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[["categories_merged", "overlapping labels", "non-overlapping labels in results", "non-overlapping labels in categories", "results"]] = filtered_df.apply(merge_categories, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[["categories_merged", "overlapping labels", "non-overlapping labels in results", "non-overlapping labels in categories", "results"]] = filtered_df.apply(merge_categories, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [8]:
filtered_df.head()

Unnamed: 0,key_0,Filename,Caption,TweetID,results,results_no_hashtags,title,categories,categories_merged,overlapping labels,non-overlapping labels in results,non-overlapping labels in categories,pranay_matching_cats_terms,pranay_matching_cats_terms_percentage,equal_labels
2,8.82882500384752,TDDL_Twitter_2017_1000,"Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17",8.82882500384752e+17,"[{'aspect category': 'JURY', 'aspect term': 'TEXT_General', 'aspect content': 'Feßmann', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'negative', 'aspect_term_first_word': 'TEXT', 'matches_aspect_category': False, 'hashtag': False}, {'aspect category': 'TEXT', 'aspect term': 'TEXT_General', 'aspect content': 'Text', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'positive', 'aspect_term_first_word': 'TEXT', 'matches_aspect_category': True, 'hashtag': False}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'negative', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': True}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl17', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'negative', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': True}]","[{'aspect category': 'JURY', 'aspect term': 'TEXT_General', 'aspect content': 'Feßmann', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'negative', 'aspect_term_first_word': 'TEXT', 'matches_aspect_category': False, 'hashtag': False}, {'aspect category': 'TEXT', 'aspect term': 'TEXT_General', 'aspect content': 'Text', 'opinion target': 'Meike Feßmann redet sich den Text schön mit der (falschen) Annahme, dass es Teil eines Romans ist #tddl #tddl17', 'sentiment': 'positive', 'aspect_term_first_word': 'TEXT', 'matches_aspect_category': True, 'hashtag': False}]",8.82882500384752e+17,"[{'aspect category': 'JURY', 'aspect term': 'Behaviour', 'opinion target': 'Meike Feßmann', 'opinion expression': 'redet sich den Text schön', 'sentiment': 'positive', 'confidence score': '1'}, {'aspect category': 'JURY', 'aspect term': 'Valuation', 'opinion target': 'die Annahme', 'opinion expression': 'falsch', 'sentiment': 'negative', 'confidence score': '-1'}]","[JURY_Behaviour, JURY_Valuation]",0,8,4,"[JURY, TEXT, False, TEXT, TEXT, True, META, META, True, META, META, True]",75.0,True
3,8.82882497335505,TDDL_Twitter_2017_1001,Kindle Hybrid! #keller #peschka #tddl,8.82882497335505e+17,"[{'aspect category': 'TEXT', 'aspect term': 'JURY_Discussion_Valuation', 'aspect content': 'Hybrid', 'opinion target': 'Kindle Hybrid! #keller #peschka #tddl', 'sentiment': 'positive', 'aspect_term_first_word': 'JURY', 'matches_aspect_category': False, 'hashtag': False}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'peschka', 'opinion target': 'Kindle Hybrid! #keller #peschka #tddl', 'sentiment': 'positive', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': True}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'Kindle Hybrid! #keller #peschka #tddl', 'sentiment': 'negative', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': True}]","[{'aspect category': 'TEXT', 'aspect term': 'JURY_Discussion_Valuation', 'aspect content': 'Hybrid', 'opinion target': 'Kindle Hybrid! #keller #peschka #tddl', 'sentiment': 'positive', 'aspect_term_first_word': 'JURY', 'matches_aspect_category': False, 'hashtag': False}]",8.82882497335505e+17,"[{'aspect category': 'JURY', 'aspect term': 'Name', 'opinion target': 'Keller', 'opinion expression': '', 'sentiment': 'neutral', 'confidence score': '0'}, {'aspect category': 'ALLO-REFERENCE', 'aspect term': 'Other-Author', 'opinion target': 'Peschka', 'opinion expression': '', 'sentiment': 'neutral', 'confidence score': '0'}]","[JURY_Name, ALLO-REFERENCE_Other-Author]",0,6,4,"[TEXT, JURY, False, META, META, True, META, META, True]",66.666667,True
4,8.82882518852276,TDDL_Twitter_2017_1002,Google-Bildersuche nach »Kindl« #tddl pic.twitter.com/FxQhSVMAjs,8.82882518852276e+17,"[{'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'Kindl', 'opinion target': 'Google-Bildersuche nach »Kindl« #tddl pic.twitter.com/FxQhSVMAjs', 'sentiment': 'negative', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': False}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'Google-Bildersuche nach »Kindl« #tddl pic.twitter.com/FxQhSVMAjs', 'sentiment': 'positive', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': True}]","[{'aspect category': 'META', 'aspect term': 'META_Technology_Social-Media', 'aspect content': 'Kindl', 'opinion target': 'Google-Bildersuche nach »Kindl« #tddl pic.twitter.com/FxQhSVMAjs', 'sentiment': 'negative', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': False}]",8.82882518852276e+17,"[{'aspect category': 'ALLO-REFERENCE', 'aspect term': 'SCREEN_Film_Tv', 'opinion target': 'Google-Bildersuche nach »Kindl«', 'opinion expression': 'pic.twitter.com/FxQhSVMAjs', 'sentiment': 'neutral', 'confidence score': '0'}]",[ALLO-REFERENCE_SCREEN_Film_Tv],0,4,2,"[META, META, True, META, META, True]",100.0,True
5,8.82882554000536,TDDL_Twitter_2017_1003,Klaus Kastbergers kreisenden Fuß hätte ich gern als Gif. #tddl,8.82882554000536e+17,"[{'aspect category': 'META', 'aspect term': 'TEXT_General', 'aspect content': 'Gif', 'opinion target': 'Klaus Kastbergers kreisenden Fuß hätte ich gern als Gif. #tddl', 'sentiment': 'positive', 'aspect_term_first_word': 'TEXT', 'matches_aspect_category': False, 'hashtag': False}, {'aspect category': 'META', 'aspect term': 'META_Main-Event', 'aspect content': 'tddl', 'opinion target': 'Klaus Kastbergers kreisenden Fuß hätte ich gern als Gif. #tddl', 'sentiment': 'positive', 'aspect_term_first_word': 'META', 'matches_aspect_category': True, 'hashtag': True}]","[{'aspect category': 'META', 'aspect term': 'TEXT_General', 'aspect content': 'Gif', 'opinion target': 'Klaus Kastbergers kreisenden Fuß hätte ich gern als Gif. #tddl', 'sentiment': 'positive', 'aspect_term_first_word': 'TEXT', 'matches_aspect_category': False, 'hashtag': False}]",8.82882554000536e+17,"[{'aspect category': 'JURY', 'aspect term': 'Appearance_Clothing', 'opinion target': 'Klaus Kastberger', 'opinion expression': 'kreisenden Fuß', 'sentiment': 'neutral', 'confidence score': '0'}, {'aspect category': 'META', 'aspect term': 'Technology_Social-Media', 'opinion target': 'Gif', 'opinion expression': 'gern als Gif', 'sentiment': 'positive', 'confidence score': '1'}]","[JURY_Appearance_Clothing, META_Technology_Social-Media]",0,4,4,"[META, TEXT, False, META, META, True]",50.0,True
6,8.82882648758264,TDDL_Twitter_2017_1004,Jury eher so halbe-halbe. In sich gespalten. #tddl,8.82882648758264e+17,"[{'aspect category': 'JURY', 'aspect term': 'JURY_Discussion_Valuation', 'aspect content': 'tddl', 'opinion target': 'Jury eher so halbe-halbe. In sich gespalten. #tddl', 'sentiment': 'negative', 'aspect_term_first_word': 'JURY', 'matches_aspect_category': True, 'hashtag': True}]",[],8.82882648758264e+17,"[{'aspect category': 'JURY', 'aspect term': 'Discussion_Valuation', 'opinion target': 'die Jury', 'opinion expression': 'eher so halbe-halbe. In sich gespalten', 'sentiment': 'neutral', 'confidence score': '0'}]",[JURY_Discussion_Valuation],0,2,2,"[JURY, JURY, True]",100.0,True


In [None]:
filtered_df.to_csv('2017_bert_vs_gpt.csv', index=False)