In [1]:
import json
import pandas as pd
import re

In [2]:
# Input file, relative to the notebook's working directory; the loading cell
# parses each line of it as one JSON object.
file_path = "datasets/decahose_polls_2021-08.txt"

In [3]:
# Accumulator for the per-tweet records (one dict per parsed line) that later
# cells turn into a DataFrame.
data = list()

In [4]:
def _extract_tweet_fields(tweet):
    """Flatten one decoded tweet object into the flat record used by this notebook.

    Every nested lookup is null-safe: a key that is missing *or* present with a
    null value yields the same default ('' / 0 / [] / None) instead of raising.

    Args:
        tweet: dict decoded from one JSON line of the input file.

    Returns:
        dict with one entry per output column.
    """
    # `or {}` covers both "key absent" and "key present but null".
    user = tweet.get('user') or {}
    entities = tweet.get('entities') or {}
    place = tweet.get('place') or {}
    extended = tweet.get('extended_tweet') or {}

    return {
        'text': tweet.get('text', ''),
        'user_name': user.get('name', ''),
        'user_screen_name': user.get('screen_name', ''),
        'user_location': user.get('location', ''),
        'user_followers_count': user.get('followers_count', 0),
        'retweet_count': tweet.get('retweet_count', 0),
        'favorite_count': tweet.get('favorite_count', 0),
        'is_retweet': 'retweeted_status' in tweet,
        'tweet_language': tweet.get('lang', ''),
        'created_at': tweet.get('created_at', ''),
        'tweet_source': tweet.get('source', ''),
        'in_reply_to_status_id': tweet.get('in_reply_to_status_id', None),
        'in_reply_to_user_id': tweet.get('in_reply_to_user_id', None),
        'in_reply_to_screen_name': tweet.get('in_reply_to_screen_name', None),
        'hashtags': [hashtag['text'] for hashtag in (entities.get('hashtags') or [])],
        'user_mentions': [mention['screen_name'] for mention in (entities.get('user_mentions') or [])],
        'place': place.get('full_name', ''),
        # One list of option dicts per poll attached to the tweet.
        'polls': [poll.get('options', []) for poll in (entities.get('polls') or [])],
        'is_extended_tweet': 'extended_tweet' in tweet,
        'extended_tweet_full_text': extended.get('full_text', ''),
    }


# NOTE: records are appended to the existing `data` list, so re-running this
# cell without re-running the cell that resets `data` duplicates every row.
# The encoding is explicit so the multilingual text does not depend on the
# platform's default codec.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            continue

        # Parse each line as a JSON object; malformed lines still raise.
        tweet = json.loads(line)

        # Append the extracted data to the list
        data.append(_extract_tweet_fields(tweet))

In [6]:
# One row per collected tweet record; columns come from the record keys.
df = pd.DataFrame(data=data)

# Plain-text preview of the first five rows.
print(df.head(n=5))

                                                text  \
0  RT @CaesarsSports: Thursday Dingers is coming ...   
1  RT @Openly: Should sports bodies allow interse...   
2                       RT @TheSkyBlueHub: Defenders   
3  RT @kooky__official: 1등 케이팝 최고의 아기펭귄에게는 🐧kooky...   
4                                       @iGOLDWING94   

                                       user_name user_screen_name  \
0                                     CoachPicks    TheCoachPicks   
1                       Lord Rev Col: Igneous Mo       touristcol   
2  The ßky Blue Hub (everything & anything CCFC)    TheSkyBlueHub   
3                               abeyours 🐰 D-193         huniehoe   
4                                  anto lvs melu          iTBSD91   

           user_location  user_followers_count  retweet_count  favorite_count  \
0                   None                  1147              0               0   
1  Northampton or Malawi                   444              0               0   
2    

In [7]:
# Patterns are compiled once and applied in this order; every match is deleted.
_URL_RE = re.compile(r'http\S+')        # links: "http" up to the next whitespace
_MENTION_RE = re.compile(r'@\S+')       # Twitter handles (@user), incl. trailing punctuation
_HASHTAG_RE = re.compile(r'#\S+')       # hashtags (#hashtag)
# Anything that is neither a word character nor whitespace: punctuation,
# symbols, emoji. Letters, digits and underscores (in any script) are KEPT.
_NON_WORD_RE = re.compile(r'[^\w\s]')


def _clean_tweet_text(text):
    """Return `text` without URLs, mentions, hashtags and punctuation, lowercased.

    Non-string input (e.g. a missing value) is cleaned to an empty string
    instead of raising.
    """
    if not isinstance(text, str):
        return ''
    for pattern in (_URL_RE, _MENTION_RE, _HASHTAG_RE, _NON_WORD_RE):
        text = pattern.sub('', text)
    return text.lower()


# Single pass over the raw text; re-running the cell gives the same column.
df['cleaned_text'] = df['text'].map(_clean_tweet_text)

In [8]:
# Forward-fill: each missing value takes the last non-missing value above it in
# the same column. `DataFrame.ffill()` replaces `fillna(method='ffill')`, whose
# `method=` argument is deprecated in recent pandas releases.
# NOTE(review): rows are independent tweets, so this copies values (location,
# reply ids, ...) from the preceding tweet — confirm that is really intended.
df = df.ffill()

In [9]:
# Length of the raw (uncleaned) tweet text, computed with Python's built-in len().
df['tweet_length'] = df['text'].map(len)

In [23]:
# Rebuild the frame from the raw records, keeping only "text" and "polls".
# NOTE: this rebinds `df`, so the columns derived in earlier cells
# (cleaned_text, tweet_length, ...) are no longer available afterwards.
df = pd.DataFrame(data, columns=['text', 'polls'])

# Shuffle and reduce to at most 4000 rows. The size is clamped because
# sample() raises ValueError when asked for more rows than exist
# (sampling without replacement).
sample_size = min(4000, len(df))
df = df.sample(n=sample_size, random_state=1)  # fixed seed -> reproducible subset

# Save the reduced dataset to a csv file.
df.to_csv('reduced_dataset.csv', index=False)


In [14]:
# Show the poll options of the row at position 10 (0-based) as a sanity check.
print(df['polls'].iat[10])


[[{'position': 1, 'text': 'Sim'}, {'position': 2, 'text': 'Claro'}, {'position': 3, 'text': 'Óbvio'}, {'position': 4, 'text': 'Demorou'}]]


In [None]:
# NOTE(review): this whole cell is commented-out code and never runs. It refers
# to names (dataset_test, tokenizer2, ft_model, eval_tokenizer, torch) that are
# not defined in the visible part of this notebook, and it grows a DataFrame
# row-by-row with `DataFrame.append`, which was removed in pandas 2.0.
# Consider deleting it or moving it to the notebook it belongs to.

# import pandas as pd

# # Initialize an empty DataFrame
# df = pd.DataFrame(columns=["question", "Mistral_Output"])

# # Iterate over all questions
# for question in dataset_test['history']:
#     model_input = tokenizer2(question, return_tensors="pt").to("cuda")

#     ft_model.eval()
#     with torch.no_grad():
#         output = eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=500)[0], skip_special_tokens=True)

#     # Print the output
#     print(f"Question: {question}\nOutput: {output}\n")

#     # Append the question and its output to the DataFrame
#     df = df.append({"question": question, "Mistral_Output": output}, ignore_index=True)

#     # Save the DataFrame to a CSV file
#     df.to_csv("test_eval.csv", index=False)
