In [1]:
import json
import keyword
import unicodedata
from collections import defaultdict
import pandas as pd
from extract import SocialETL, construct_query_for_twarc, extract_tags, SocialDB, UserETL
import hashtags as h
import hashtags_readcsv as r_csv
from rich import print
from rich.console import Console
from rich.progress import track
from rich.table import Table

from pathlib import Path
import numpy as np
from extract import extract_tags

## Store hashtags into `twee_hash_800.csv`

In [2]:
def ensure_latin(s):
    """Return ``s`` reduced to characters representable in Latin-1.

    The string is NFKD-normalized first, so accented letters are split into a
    base letter plus combining marks; anything that cannot be encoded as
    Latin-1 (including those combining marks) is then silently dropped.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    latin_bytes = decomposed.encode("latin-1", errors="ignore")
    return latin_bytes.decode("latin-1")


def construct_query_for_pandas(root_tags: list) -> str:
    """Build a pandas query expression that OR-s the given tags together.

    Args:
        root_tags: tag names (strings) to combine.

    Returns:
        The tags joined with ``" or "``; an empty string for an empty list.
    """
    separator = " or "
    return separator.join(root_tags)


def create_score(
    df: pd.DataFrame, one_hashtag: str, root_tags: dict
) -> "tuple[list, int] | bool":
    """Score one hashtag against each root-tag category by co-occurrence.

    Args:
        df: one row per tweet, one boolean column per hashtag (True when the
            tweet contains that hashtag).
        one_hashtag: name of the column/hashtag to score.
        root_tags: dict of ``{category: [hashtags]}``.

    Returns:
        ``False`` when fewer tweets contain ``one_hashtag`` than the support
        threshold. Otherwise a tuple ``(scores, support)`` where ``scores``
        has one entry per category, in ``root_tags`` order: the fraction
        (rounded to 4 decimals) of the tweets containing ``one_hashtag`` that
        also contain at least one of the category's root tags. ``support`` is
        the number of tweets containing ``one_hashtag``.

    Raises:
        KeyError: if ``one_hashtag`` is not a column of ``df``.
    """
    # TODO: make it so it accepts a list of {category: [hashtags]} instead of just one
    # Support threshold: 0.9 tweets per 10000 rows of df, but never below 1.
    threshold_support = max(0.9 * len(df) / 10000, 1)

    # Keep only the tweets that contain the hashtag being scored.
    tweets_with_tag = df[df[one_hashtag]]
    support = len(tweets_with_tag)
    if support < threshold_support:
        return False

    results = []
    for category, tag_madre in root_tags.items():
        # DataFrame.query reports an unknown column as pandas'
        # UndefinedVariableError, which derives from NameError and is not a
        # KeyError. A missing root tag is therefore checked for explicitly.
        if any(tag not in tweets_with_tag.columns for tag in tag_madre):
            results.append(0)
            continue
        try:
            my_query = construct_query_for_pandas(tag_madre)
            matching = tweets_with_tag.query(my_query)
            results.append(round(len(matching) / support, 4))
        except KeyError:
            results.append(0)

    return (results, support)


def do_search(tagmadre, pages):
    """Download tweets for the root hashtags and save a tweet x hashtag matrix.

    Builds a query from ``tagmadre`` with ``construct_query_for_twarc``, runs
    it through ``SocialETL`` and writes ``twee_hash_{pages}.csv``: one row per
    tweet id and one column per hashtag, holding True where the tweet carries
    that hashtag and False elsewhere.

    NOTE: a later cell of this notebook defines another function that is also
    called ``do_search``; once that cell runs, it replaces this one.

    Args:
        tagmadre: dict of ``{category: [root hashtags]}``.
        pages: forwarded to ``SocialETL(pages=...)`` and used in the name of
            the output CSV.

    Returns:
        None. The result is the CSV file plus progress messages.
    """
    # construct the initial query to Twarc
    query_madre = construct_query_for_twarc(tagmadre)
    m = SocialETL(
        query=f"({query_madre})",
        pages=pages,
        recent=False,
    )

    # Rows with a missing "id" or "entities.hashtags" value are dropped.
    tweets_with_hashtag = m.df[["id", "entities.hashtags"]].dropna()
    print(
        f"{len(m.df)} tweets retrieved\nwith query '{query_madre}'\nof which {len(tweets_with_hashtag)} tweets with at least 1 hashtag."
    )

    # evaluate the string in "entities.hashtags" to an actual list of dicts
    # SECURITY(review): eval() executes whatever expression the string holds,
    # and this text comes from downloaded data. If the values are always plain
    # literals, ast.literal_eval would be the safe replacement - confirm.
    tweets_with_hashtag["entities.hashtags"] = tweets_with_hashtag[
        "entities.hashtags"
    ].map(eval)

    # make a simple list of strings, one hashtag is one string, into column "tags"
    tweets_with_hashtag["tags"] = tweets_with_hashtag["entities.hashtags"].map(
        extract_tags
    )
    tweets_with_hashtag = tweets_with_hashtag.drop(columns="entities.hashtags")

    # Flatten the per-tweet tag lists into the set of distinct hashtags.
    all_hashtags = set(tweets_with_hashtag["tags"].explode())

    # Hashtags that are Python (soft) keywords are excluded from the columns,
    # presumably because column names are later used as bare names in
    # DataFrame.query expressions - TODO confirm.
    # The keyword lists are copied into a new set: extending keyword.kwlist in
    # place would grow the module-level list on every call.
    reserved_words = set(keyword.kwlist).union(keyword.softkwlist)
    all_hashtags = all_hashtags.difference(reserved_words)
    all_hashtags = all_hashtags.difference({""})

    print(f"We have {len(all_hashtags)} unique hashtags.")

    tweets_with_hashtag = tweets_with_hashtag.set_index("id")

    # Add one (initially empty) column per hashtag, then default them to False.
    col_h = sorted(all_hashtags)
    df_h = pd.DataFrame(columns=col_h)
    tweets_with_hashtag = pd.concat([tweets_with_hashtag, df_h], axis=1)
    tweets_with_hashtag = tweets_with_hashtag.fillna(False)

    def assign_hashtag_to_tweet(row: pd.Series) -> pd.Series:
        """Set to True the column of every retained hashtag listed in the row's "tags"."""
        for tag in row["tags"]:
            if tag in all_hashtags:
                row.loc[tag] = True
        return row

    tweets_with_hashtag = tweets_with_hashtag.apply(assign_hashtag_to_tweet, axis=1)
    tweets_with_hashtag = tweets_with_hashtag.drop(columns=["tags"])

    # Sanity check: summarize the column of the first root tag of each category.
    for madre in tagmadre.values():
        print(f"[red]Let's write the describe of column '{madre[0]}'")
        print(tweets_with_hashtag[madre[0]].describe())

    tweets_with_hashtag.to_csv(f"twee_hash_{pages}.csv")
    print("[green]All done.")


if __name__ == "__main__":
    # Run parameters (top_results_to_take is not used in this cell).
    top_results_to_take = 3
    pages_to_do = 800

    # Seed ("parent") hashtags, one list per category.
    tag_madre = dict(
        proukr=["slavaukraini"],
        prorus=["istandwithputin"],
        pax=["stopwarinukraine"],
    )

    # First pass: fetch the tweets and write the tweet x hashtag CSV.
    do_search(tag_madre, pages_to_do)


## Categorize the hashtags stored in `twee_hash_800.csv` and create two JSON files: `hashtags_800.json` and `supports_800.json`

In [3]:
# Load the saved tweet x hashtag table for a quick inspection.
df = pd.read_csv(Path("twee_hash_800.csv"))

In [4]:
# Size of the loaded table as (rows, columns).
df.shape

(75816, 6608)

In [5]:
# Preview the first 10 rows of the loaded table.
df.head(10)

Unnamed: 0.1,Unnamed: 0,100,1000smiles,100days,100daysofacruelinvasion,100daysofdestruction,100daysofgenocide,100daysofglory,100daysofhell,100daysofmlcode,...,zombietwitterlalaland,zoom,zoopatrol,zoreslav,zovstal,zsu,zurich,zuzana2,zwangsarbeit,zwitserland
0,1536860968105435136,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1536860867295420417,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1536859684258598912,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1536859558177832960,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1536859410919833600,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1536859401638023168,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,1536859350635282432,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,1536859313444114440,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,1536859263083085824,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,1536858986607169536,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
def do_search(tagmadre, pages):
    """Score every hashtag of the saved tweet x hashtag CSV and categorize it.

    Reads ``twee_hash_{pages}.csv`` (first column as index), scores each
    hashtag column against the root tags with ``create_score`` and assigns the
    hashtag to the category with the highest score, provided that score
    exceeds the certainty threshold.

    NOTE: this definition shares its name with the ``do_search`` defined in an
    earlier cell and replaces it once this cell has run.

    Args:
        tagmadre: dict of ``{category: [root hashtags]}``.
        pages: selects the input file, ``twee_hash_{pages}.csv``.

    Returns:
        ``(tags_categorized, supp)``: a ``defaultdict`` mapping each category
        to a list of ``(hashtag, max score)`` tuples, and a dict mapping each
        scored hashtag to its support (number of tweets containing it).
    """
    tweets_with_hashtag = pd.read_csv(Path(f"twee_hash_{pages}.csv"), index_col=0)

    # Every column of the table is treated as a hashtag.
    all_hashtags = set(tweets_with_hashtag.columns)

    # Hashtags that are Python (soft) keywords are excluded, as is the empty
    # name. The keyword lists are copied into a new set: extending
    # keyword.kwlist in place would grow the module-level list on every call.
    reserved_words = set(keyword.kwlist).union(keyword.softkwlist)
    all_hashtags = all_hashtags.difference(reserved_words)
    all_hashtags = all_hashtags.difference({""})

    # up to here we have only dealt with a dataframe of tweets. Now we switch to dataframe of hashtags

    # Collect scores and support for each hashtag with enough support.
    # Iterating in sorted order keeps the output order stable between runs
    # (plain set iteration order is not).
    all_hashtags_as_dict = {}
    supp = {}
    for hashtag in track(sorted(all_hashtags)):
        score = create_score(tweets_with_hashtag, hashtag, tagmadre)
        if score is not False:
            # score[0] is the scores, score[1] is the support
            all_hashtags_as_dict[hashtag] = score[0]
            supp[hashtag] = score[1]
    all_hashtags_df = pd.DataFrame.from_dict(
        all_hashtags_as_dict, orient="index", columns=tagmadre
    )

    # now, "categorize" hashtags. The hashtag gets the category of its max score,
    # as long as it is > `threshold_certainty`
    # TODO: remember that we still need to account for hashtag support
    # (i.e. number of tweets supporting that hashtag)
    threshold_certainty = 0.4
    tags_categorized = defaultdict(list)
    for hashtag, scores in all_hashtags_df.iterrows():
        best = scores.max()
        if best > threshold_certainty:
            tags_categorized[scores.idxmax()].append((hashtag, best))
            # Warn when another category's score is within a factor of 1.2 of
            # the winning one, i.e. the assignment is a close call.
            other_scores = set(scores).difference({best})
            for sco in other_scores:
                if best < 1.2 * sco:
                    print(f"Attention. For {hashtag} the scores are\n{scores}")

    return tags_categorized, supp


if __name__ == "__main__":
    # Run parameters.
    top_results_to_take = 3
    pages_to_do = 800

    # Seed ("parent") hashtags, one list per category.
    tag_madre = dict(
        proukr=["slavaukraini"],
        prorus=["istandwithputin"],
        pax=["stopwarinukraine"],
    )

    # First pass: categorized hashtags plus the per-hashtag support counts.
    end_results, tags_support = do_search(tag_madre, pages_to_do)

    # A second pass, re-running do_search seeded with the top
    # `top_results_to_take` hashtags of each category, is currently disabled:
    #     end_results = {
    #         k: [x[0] for x in v[:top_results_to_take]] for k, v in end_results.items()
    #     }
    #     print("miao", end_results)
    #     end_results = do_search(end_results)

    # Persist both results as UTF-8 JSON, keeping non-ASCII characters as-is.
    with open(f"hashtags_{pages_to_do}.json", "w", encoding="utf-8") as f:
        json.dump(end_results, f, ensure_ascii=False, indent=4)
    with open(f"supports_{pages_to_do}.json", "w", encoding="utf-8") as f:
        json.dump(tags_support, f, ensure_ascii=False, indent=4)

## Removal of *"neutral"* hashtags

In [9]:
# The file is written as UTF-8 with ensure_ascii=False, so it is read back
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('hashtags_800.json', 'r', encoding="utf-8") as f:
    my_dict = json.load(f)
my_dict

{'proukr': [['reallystandwithukraine', 1.0],
  ['schroeder', 1.0],
  ['bayraktar', 1.0],
  ['spaceshost', 1.0],
  ['admitukrainetoeu', 1.0],
  ['chipineurosforweapons', 1.0],
  ['denhaagiswaiting', 1.0],
  ['nowsmoking', 1.0],
  ['gofundme', 1.0],
  ['uk', 0.9216],
  ['dictatorputin', 0.8571],
  ['kharkiv', 0.8594],
  ['putinasesino', 0.7222],
  ['hitler', 1.0],
  ['summer', 0.625],
  ['ukrainiansoldier', 0.9846],
  ['freeiran2022', 1.0],
  ['de4l', 1.0],
  ['mykolaiev', 1.0],
  ['noflyzone', 0.8462],
  ['cybersecurity', 1.0],
  ['scholz', 0.9936],
  ['glory', 1.0],
  ['wojnawukrainie', 1.0],
  ['refugees', 0.6296],
  ['deadrussiansoldier', 1.0],
  ['leica', 1.0],
  ['pologi', 1.0],
  ['eugenekibets', 1.0],
  ['grune', 1.0],
  ['c40', 1.0],
  ['notmovingon', 1.0],
  ['ourbodiesourchoice', 1.0],
  ['laakenannabis', 1.0],
  ['antiracism', 1.0],
  ['ontariovotes', 1.0],
  ['scholzmussweg', 1.0],
  ['anonymousnews', 1.0],
  ['salvini', 1.0],
  ['melitopol', 0.9945],
  ['saveger6', 1.0],
  

In [12]:
# Summary table: number of hashtags per category in my_dict.
table = Table(title="# of hashtags found")
for header, alignment, colour in (
    ("Category", "left", "cyan"),
    ("# of hashtags", "right", "red"),
):
    table.add_column(header, justify=alignment, style=colour, no_wrap=True)

for category, tags in my_dict.items():
    table.add_row(str(category), str(len(tags)))

console = Console()
console.print(table)

In [13]:
# Copy every category's list too: dict.copy() alone is shallow, so removing
# entries from the copy's lists would also remove them from my_dict.
my_dict_copy = {category: list(tags) for category, tags in my_dict.items()}

In [14]:
# Hashtags that are plainly irrelevant to the analysis ("I don't care").
remove_hash_set = {
    'watch', 'nftcommunity', 'contemporaryart', 'sisterhoodforever', 'cryptoartist',
    'arte', 'cigar', 'oilpaintings', 'drewdiplomat', 'lgbtq', 'bonvoyage', 'transally',
    'tabak', 'avanticondraghi', 'artshare', 'tabakespecial', 'ourbodiesourchoice',
    'yemencantwait', 'climatecrisis', 'beautiful', 'google', 'catsontwitter', 'karma',
    'bbc', 'artwork', 'mustread', 'freesyria', 'taiwanisnotchina', 'johnsonout170',
    'dove', 'backboris', 'clouds', 'cats', 'manifesta14prishtina', '31luglio',
    'nowsmoking', 'bastille', 'mondaymorning', 'coffee', 'punk', '14juillet',
    'billsmafia', 'ilustration', 'etc', 'infosec', 'drawingart', 'music', 'metaverse',
    'naturephotography', 'johnsonout174', '1988massacre', 'badass', 'cybersecurity',
    'monochrome', 'newprofilepic', 'mariodraghi', 'redcarpet', 'summer',
    'visitkotkahamina', 'landscape', 'eurovision', 'iostocondraghi', 'error404',
    'retwit', 'research', 'cbdnft', 'sunflowers', 'kittyloafmonday', 'cryptoart',
    'nft', 'amazing', 'voteblue2022', 'eurovision2023', 'cigarlife', 'brexithasfailed',
    'blog', 'graziepresidentedraghi', 'fantastic', 'poems', 'poet', 'cityphotography',
    'facciamorete', 'cityscape', 'catsoftwittter', 'catlover', 'elezioni2022',
    'climateemergency', 'freeiran', 'animals', 'nftcollectors', 'problem', 'catlovers',
    'sea', 'nftartist', 'blacklivesmatter', 'whateverittakes', 'fresh', 'medvedev',
    'toriesout22', 'standwithboris', 'dogsoftwitter', 'toriesout25', 'followbackfriday',
    'tshirt', 'cutepets', 'gracias', 'lgbt', 'art', 'boots', 'salvini',
    'boycottleroymerlin', 'macron', 'sketch', 'fuckserbia', 'figurativepainting',
    '500px', 'rammstein', 'oilpainting', 'banassaultweaponsnow', 'crowdfunding',
    'retweeet', 'catsoftwitter', 'spotify', 'toriesout15', 'draw', 'mondaythoughts',
    'voteblue', 'womensrightsarehumanrights', 'pentesting', 'nfts', 'vogue', 'tiktok',
    'blackandwhitephoto', 'water', 'nightvision', 'pentagon', 'cybersec', 'glory',
    'lifestyle', 'animalrescue', 'kindnessismagic', 'catslover', 'throwbackthursday',
    'twitter', 'cbdoil', 'caturday', 'nftcollector', 'ford', 'biden', 'ourbluevoice',
    'flashlights', '14juillet2022', 'mybodymychoice', 'highlandparkstrong',
    'covidisnotover', 'photography', 'boxing', 'splendidfeet', 'nftshills', 'covid19',
    'collageart', 'frexit', 'streaming', 'nftjapan', 'phrases', 'nftproject', 'oscars',
    'cocaino', 'cryptocurrency', 'ai', 'artistoninstagram', 'artistsontwitter',
    'promotion', 'painting', 'graffiti', 'nftcommumity', 'ballerina', 'eyes', 'lyrik',
    'collagen', 'nftnyc2022', 'mariopaciolla', 'hollywood', 'mothersday', 'aiart',
    'indie', 'giulioregeni', 'iphone', 'nftartwork', 'nftcollections', 'sexymonday',
    'happymothersday2022', 'covid_19', 'youtuber', 'collageartwork', 'imaginedragons',
    'bidenlsalaughingstock',
}
# "Neutral" hashtags: geographic locations.
remove2_hash_set = {
    'polish', 'lithuania', 'ucraine', 'romaniaukraine', 'milano', 'georgia', 'minsk',
    'estonia', 'crimea', 'mariupol', 'ukrain', 'odesa', 'azerbaijan', 'ukrania', 'moskau',
    'bielorussia', 'paris', 'syria', 'norway', 'buffalo', 'finland', 'germany', 'donbass',
    'india', 'mosca', 'chicago', 'ohio', 'odessa', 'ucraina', 'kiev', 'moscow', 'france',
    'kharkiv', 'northvancouver', 'europe', 'lettland', 'chaplynka', 'europa', 'italia',
    'olenivka', 'texas', 'africa', 'vancouver', 'turkey', 'afghanistan', 'berlin', 'usa',
    'tallinn', 'california', 'uk', 'switzerland', 'northkorea', 'japan', 'chinataiwan',
    'australia',
}

In [15]:
# "Neutral" hashtags that refer to places involved in the war:
#'ucraine','romaniaukraine','crimea','mariupol','ukrain','odesa','ukrania','moskau','donbass','mosca','odessa','ucraina','kiev','moscow','kharkiv','chaplynka','olenivka',

In [16]:
remove = set.union(remove_hash_set, remove2_hash_set)
# Each entry is a [hashtag, score] pair; keep those whose hashtag is not in
# the removal set. The lists are rebuilt rather than edited with
# list.remove() during iteration, which skips the element after each removal
# and would also alter any other object sharing the same lists.
my_dict_copy = {
    category: [entry for entry in tags if entry[0] not in remove]
    for category, tags in my_dict_copy.items()
}

In [18]:
# Same summary as before, now for the filtered my_dict_copy.
table = Table(title="# of hashtags found")
column_specs = [
    ("Category", {"justify": "left", "style": "cyan"}),
    ("# of hashtags", {"justify": "right", "style": "red"}),
]
for header, options in column_specs:
    table.add_column(header, no_wrap=True, **options)

for category, tags in my_dict_copy.items():
    table.add_row(str(category), str(len(tags)))

console.print(table)

## Write the final JSON file: `hashtags_final.json`

In [19]:
# Save the filtered categories as UTF-8 JSON, keeping non-ASCII characters as-is.
with Path("hashtags_final.json").open("w", encoding="utf-8") as f:
    json.dump(my_dict_copy, f, ensure_ascii=False, indent=4)