In [None]:
import numpy as np
import pandas as pd
import re
# Notebook display settings: no cap on displayed columns, at most 100 rows.
DISPLAY_OPTIONS = {
    "display.max_columns": None,
    "display.max_rows": 100,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the raw tweets and print how many rows the file contains.
raw_tweets = pd.read_csv("only_climate_tweets.csv")
print(len(raw_tweets))

# Keep the first row for each tweet_id; the cell output is the de-duplicated row count.
df = raw_tweets.drop_duplicates(subset="tweet_id")
len(df)

In [None]:
# Build the working frame from only the columns used in preprocessing.
preproc_columns = [
    "user_screen_name",
    "tweet_id",
    "tweet_created_at",
    "tweet_full_text",
]
df_preproc = df.loc[:, preproc_columns].copy()

In [None]:
# Preview the first five rows of the working frame.
df_preproc.head(5)

In [None]:
# One-time setup: uncomment and run once to install spaCy and the Danish model.
# %pip install spacy
# !python -m spacy download da_core_news_lg

In [None]:
import spacy
# Load the Danish spaCy pipeline once; the helper functions below reuse `nlp`.
SPACY_MODEL_NAME = "da_core_news_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

# Default stop word collection of the loaded pipeline's language class.
all_stopwords = nlp.Defaults.stop_words

In [None]:
def remove_stopwords(sentence, stopwords=None):
    """
    Removes stopwords from a sentence and returns the filtered string.

    Parameters
    ----------
    sentence : str
        Text to filter. It is split on single spaces and every token is
        compared exactly as it appears (no lowercasing or punctuation
        stripping happens here).
    stopwords : collection of str, optional
        Words to drop. Defaults to the module-level ``all_stopwords``
        (the Danish stopwords imported from spacy).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopwords is None:
        # Resolved at call time so the default always follows the loaded model.
        stopwords = all_stopwords
    tokens = sentence.split(" ")
    return " ".join(word for word in tokens if word not in stopwords)

def preproccessor(string, verb_noun_only=False):
    """
    Helper function for lemmatizer().

    Cleans a raw tweet and passes the result through ``nlp``. Steps, in order:
    1) lowercase the string
    2) remove urls, then any leftover "http"/"https" fragments
    3) remove mentions, hashtags, and a leading "rt" retweet marker
    4) replace non-word characters with spaces
    5) collapse runs of whitespace into a single space
    6) strip leading and trailing whitespace
    7) remove stopwords via remove_stopwords()

    Parameters
    ----------
    string : str
        Raw tweet text. Non-string values (for example the NaN pandas
        produces for an empty text cell) are treated as an empty tweet.
    verb_noun_only : bool, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    The object returned by ``nlp(...)`` for the cleaned text.
    """
    # Missing values (NaN / None) have no .lower(); treat them as empty text.
    if not isinstance(string, str):
        string = ""

    # Lowercase
    string = string.lower()

    # Remove url. The scheme is `https?`; the previous `http?` made the
    # final "p" optional instead of the "s".
    string = re.sub(
        r"https?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
        "",
        string)

    # Remove weird remaining http
    string = re.sub(r"https?", "", string)

    # Remove mentions, hashtags, and RT. Raw string avoids the invalid "\w"
    # escape; the \b keeps words that merely start with "rt" intact.
    string = re.sub(r"@\w+|#\w+|^rt\b", "", string)

    # Remove non-alphanumerical values
    string = re.sub(r"\W", " ", string)

    # Remove more than one whitespace
    string = re.sub(r"\s{2,}", " ", string)

    # Remove leading and trailing whitespaces
    string = string.strip()

    # Remove stopwords
    string = remove_stopwords(string)

    # Create and return doc object
    return nlp(string)
   
def lemmatizer(string):
    """
    Run the string through preproccessor() and return its lemmas as one
    space-separated string, keeping only lemmas longer than three characters.
    """
    kept_lemmas = []
    for token in preproccessor(string):
        lemma = token.lemma_
        if len(str(lemma)) > 3:
            kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

def lemmatizer_reduced(string):
    """
    Reduce an already lemmatized string to the lemmas of its proper nouns,
    nouns, and verbs, joined by single spaces.
    """
    kept_pos = ["PROPN", "NOUN", "VERB"]
    lemmas = []
    for token in nlp(string):
        if token.pos_ in kept_pos:
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [None]:
# Two full passes over the dataframe (one per derived column). Takes a long time.
df_preproc["tweet_text_lemma"] = df_preproc["tweet_full_text"].map(lemmatizer)
df_preproc["tweet_text_lemma_reduced"] = df_preproc["tweet_text_lemma"].map(lemmatizer_reduced)
# Both helpers return a joined string, so these columns hold strings
# (possibly empty) and a dropna on "tweet_text_lemma_reduced" is left disabled.

In [None]:
# Write the frame as lemma.csv inside the zip archive lemma.zip (index column omitted).
compression = {"method": "zip", "archive_name": "lemma.csv"}
df_preproc.to_csv("lemma.zip", index=False, compression=compression)