In [35]:
import pandas as pd
import json
import re

In [37]:
def txt_to_df(txt_path):
    """
    Powerhouse function to take raw scraped twitter data
    into a DataFrame of just the tweet texts and years

    Two record layouts are recognised, checked in this order:
        * records with "full_text" and "created_at" fields, plus an
          optional "retweeted_status" field that is expanded via ext_rt
        * records with "content" and "date" fields
    Only records whose "lang" field equals "pt" are kept.

    Args:
        txt_path: The path for the saved file containing
            the tweet data in json format, one JSON object per line.
            Lines that do not parse to a JSON object are skipped.
    Returns:
        A DataFrame containing just the raw tweet texts and years tweeted
        (columns "long_text" and "year"). The frame is empty when the file
        holds no usable records or no tweets with lang == "pt".
    Raises:
        KeyError: if the records carry no "lang" field, or match neither
            of the recognised layouts.
    """
    result_columns = ["long_text", "year"]

    records = []
    # The context manager closes the handle even if reading fails part-way.
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(txt_path, "r", encoding="utf-8") as tweets_file:
        for line in tweets_file:
            try:
                record = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            # A bare number/string/list on a line is not a tweet record.
            if isinstance(record, dict):
                records.append(record)

    if not records:
        return pd.DataFrame(columns=result_columns)

    tweets = pd.DataFrame(records)
    tweets = tweets[tweets["lang"] == "pt"]
    if tweets.empty:
        # Row-wise .apply on an empty frame does not return a Series,
        # so short-circuit instead of feeding it to the helpers below.
        return pd.DataFrame(columns=result_columns)

    available = set(tweets.columns)
    if {"full_text", "created_at"} <= available:
        text_col, date_col = "full_text", "created_at"
        # The column is absent when no record in the file is a retweet.
        has_retweets = "retweeted_status" in available
    elif {"content", "date"} <= available:
        text_col, date_col = "content", "date"
        has_retweets = False
    else:
        raise KeyError(
            "tweet records have neither ('full_text', 'created_at') "
            "nor ('content', 'date') fields"
        )

    keep = [text_col, date_col]
    if has_retweets:
        keep.append("retweeted_status")
    # Work on an explicit copy so the column assignments below never target
    # a view of the filtered frame.
    tweets = tweets[keep].copy()

    tweets["long_text"] = tweets[text_col]
    if has_retweets:
        tweets["long_text"] = tweets.apply(ext_rt, axis=1)
    tweets["long_text"] = tweets.apply(rm_links, axis=1)
    tweets["year"] = pd.to_datetime(tweets[date_col])
    tweets["year"] = tweets.apply(to_year, axis=1)
    return tweets[result_columns]


def ext_rt(row):
    """
    Function to extract full retweet text
    For use in txt_to_df and .apply or .map functionality

    When row["retweeted_status"] is a dict, the text is looked up in
    retweeted_status["extended_tweet"]["full_text"] first and then in
    retweeted_status["full_text"].
    NOTE(review): the second lookup presumably matches payloads fetched in
    extended mode, where the retweet carries "full_text" directly - confirm
    against the scraper that produced the data.

    Args:
        row: row from dataframe (anything indexable by "retweeted_status"
            and "long_text")
    Returns:
        Full text of a retweeted tweet, or row["long_text"] when the row is
        not a retweet or the retweet holds no full text
    """
    try:
        retweet = row["retweeted_status"]
    except (KeyError, IndexError, TypeError):
        # No retweet information on this row at all.
        retweet = None

    # Non-retweets show up as NaN/None rather than a dict.
    if isinstance(retweet, dict):
        extended = retweet.get("extended_tweet")
        if isinstance(extended, dict) and "full_text" in extended:
            return extended["full_text"]
        if "full_text" in retweet:
            return retweet["full_text"]
    return row["long_text"]


def to_year(row):
    """
    Pull the calendar year out of a row's timestamp
    Meant to be handed to .apply or .map, as txt_to_df does
    Args:
        row: row from dataframe; its "year" entry must expose a
            .year attribute
    Returns:
        The .year attribute of row["year"], i.e. the year a tweet was tweeted
    """
    timestamp = row["year"]
    return timestamp.year


def rm_links(row):
    """
    Function to remove links from tweets
    For use in txt_to_df and .apply or .map functionality

    Both "http:" and "https:" links are stripped, each up to the next
    whitespace character. The row itself is left untouched; only the
    cleaned text is returned.

    Args:
        row: row from dataframe (anything indexable by "long_text")
    Returns:
        Tweet without links. A non-string value (e.g. NaN or None for a
        missing text) is returned unchanged.
    """
    text = row["long_text"]
    if not isinstance(text, str):
        # Missing text: nothing to clean, and re.sub would raise TypeError.
        return text
    # Do not write back into `row`: it is the object .apply hands in, and
    # mutating it inside the callback is unsupported by pandas.
    return re.sub(r"https?:\S*", "", text)


In [67]:
# Line-delimited JSON scrape to convert. The path is relative, so the file
# must sit in the notebook's working directory.
path = "sustentabilidade_15_21.txt"

# Parse the scrape and persist the (long_text, year) table as CSV.
# NOTE(review): to_csv is called without index=False, so the DataFrame index
# is written as an unnamed first column - confirm downstream readers expect it.
txt_to_df(path).to_csv("sustentabilidade.csv")

In [66]:

# Re-runs the full parse of `path` only to display the resulting frame.
# NOTE(review): `path` is defined in the export cell, whose execution count
# (67) is later than this cell's (66) - confirm the notebook still works
# under Restart & Run All.
# NOTE(review): the saved output includes an "Unnamed: 0" column, which
# txt_to_df does not return, so the output looks stale - re-run to refresh.
txt_to_df(path)

Unnamed: 0,long_text,year
0,Águas do Agro busca sustentabilidade do meio r...,2021
1,Prazer d ter o novo livro d @soniafavaretto #v...,2021
2,@senadorhumberto @mmc032 Perdi exatamente por ...,2021
3,Vendo as fotos e vídeos da galera no show de o...,2021
4,São Roque atualiza objetivos da cartilha de su...,2021
...,...,...
49833,Do compromisso em emissão zero de carbono à en...,2021
49834,"Mas fora isso, jogão da porra, mano. Divertido...",2021
49835,"Hoje dia 22 de Abril é o Dia da Terra, um dia ...",2021
49836,"Queria poder descomplicar a vida desse jeito, ...",2021
