In [None]:
import pandas as pd
import re
import random
import datetime
import time

# Term added to the denominator by estimate_probability's add-one smoothing.
# NOTE(review): hard-coded; presumably the number of distinct words in the
# tweet corpus -- confirm it still matches the data set being loaded.
LEN_VOCABULARY=1866

In [None]:
# A forward slash keeps this relative path valid on Windows, Linux and macOS;
# the previous backslash separator only resolved on Windows.
tweet_df=pd.read_csv('data/tweets.csv')

# Ignore the retweets so that only original tweets are kept.
tweet_df=tweet_df[tweet_df['is_retweet'] == False]
# From here on tweet_df is a plain list of tweet strings, not a DataFrame.
tweet_df=tweet_df["text"].astype(str).tolist()

def clean_tweets(tweet_list):
    """Normalise raw tweet strings and split each one into a list of words.

    For every tweet, in this order: links beginning with ``http://`` or
    ``https://`` are dropped, a dash surrounded by whitespace is collapsed
    into a single space, the text is lowercased, and a fixed set of
    punctuation characters is deleted. The result is then split on
    whitespace.

    Args:
        tweet_list: Iterable of tweet strings.

    Returns:
        A list with one list of word strings per input tweet, in input
        order. A tweet with no remaining words yields an empty list.
    """
    # Raw string: the backslash is itself one of the characters to delete,
    # and a raw literal avoids the invalid "\]" escape sequence.
    chars_to_remove = r"#$%'()*+,./:;<=>?[\]^_`{|}~" + "“" + "”"
    removal_table = str.maketrans("", "", chars_to_remove)

    # Compile once, outside the per-tweet loop.
    link_pattern = re.compile(r"https?://\S+")
    spaced_dash_pattern = re.compile(r"\s-\s")

    array_tweets = []
    for tweet in tweet_list:
        text = link_pattern.sub("", tweet)
        text = spaced_dash_pattern.sub(" ", text)
        # Lowercase first, then strip punctuation in a single pass.
        text = text.lower().translate(removal_table)
        array_tweets.append(text.split())
    return array_tweets

def add_begin_end(tweet_list):
    """Wrap every tokenised tweet in a start tag and an end tag.

    Each inner list is modified in place: ``"<b>"`` becomes its first
    element and ``"<end>"`` its last.

    Args:
        tweet_list: List of word lists.

    Returns:
        The same ``tweet_list`` object that was passed in.
    """
    start_tag, end_tag = "<b>", "<end>"
    for words in tweet_list:
        # Slice assignment at position 0 prepends without creating a new list.
        words[:0] = [start_tag]
        words.append(end_tag)
    return tweet_list

def bigrams_generator(array_tweets):
    """Count every pair of adjacent words across all tweets.

    Args:
        array_tweets: List of word lists.

    Returns:
        A dict mapping ``"first second"`` (the two words joined by one
        space) to the number of times that pair occurs. Tweets with fewer
        than two words contribute nothing.
    """
    pair_counts = {}
    for words in array_tweets:
        # Pair each word with its right-hand neighbour.
        for left, right in zip(words, words[1:]):
            pair = left + " " + right
            pair_counts[pair] = pair_counts.get(pair, 0) + 1
    return pair_counts

def unigram_generator(array_tweets):
    """Count how often each single word occurs across all tweets.

    Args:
        array_tweets: List of word lists.

    Returns:
        A dict mapping each word to its number of occurrences.
    """
    word_counts = {}
    # Flatten the tweets into one stream of tokens and tally them.
    for token in (word for words in array_tweets for word in words):
        word_counts[token] = word_counts.get(token, 0) + 1
    return word_counts

def generate_tweet(bigram_dictionary):
    """Generate a tweet by greedily following the highest-scoring bigrams.

    A bigram whose first word is ``"<b>"`` is picked at random as the
    opening. After that, the word with the highest score among the bigrams
    that start with the current last word is appended, until ``"<end>"``
    is reached. A word with no successor is followed by ``"<end>"``.

    Because every step after the random opening is deterministic, appending
    a word that is already in the sentence would repeat the same words
    forever. In that case the sentence is closed with ``"<end>"`` instead,
    so the function always terminates.

    Args:
        bigram_dictionary: Dict mapping ``"first second"`` keys to a count
            or probability. Only entries with a score above 0 are used.

    Returns:
        A list of words that starts with ``"<b>"`` and ends with
        ``"<end>"``.

    Raises:
        IndexError: If no bigram starts with ``"<b>"``.
    """
    # Single pass over the bigrams: collect the possible openings and, for
    # each first word, remember its best successor. On ties the bigram seen
    # first wins (strict ">" comparison).
    opening_keys = []
    best_successor = {}
    for key, score in bigram_dictionary.items():
        words = key.split()
        if not words:
            continue
        if words[0] == "<b>":
            opening_keys.append(key)
        if len(words) < 2:
            continue
        best_score = best_successor.get(words[0], ("<end>", 0))[1]
        if score > best_score:
            best_successor[words[0]] = (words[1], score)

    sentence = random.choice(opening_keys).split()
    seen = set(sentence)
    while sentence[-1] != "<end>":
        next_word = best_successor.get(sentence[-1], ("<end>", 0))[0]
        if next_word in seen:
            # Revisiting a word means the greedy walk has entered a cycle.
            next_word = "<end>"
        sentence.append(next_word)
        seen.add(next_word)
    return sentence

def estimate_probability(bigram_dict, unigram_dict, vocabulary_size=None):
    """Turn raw bigram counts into add-one smoothed probabilities.

    Every value in ``bigram_dict`` is replaced in place by
    ``(bigram_count + 1) / (first_word_count + vocabulary_size)``.

    Args:
        bigram_dict: Dict mapping ``"first second"`` keys to counts. It is
            modified in place.
        unigram_dict: Dict mapping single words to counts. A first word
            that is missing from it is treated as having count 0.
        vocabulary_size: Value added to the denominator. Defaults to the
            module-level ``LEN_VOCABULARY`` when omitted.

    Returns:
        The same ``bigram_dict`` object, now holding probabilities.
    """
    if vocabulary_size is None:
        vocabulary_size = LEN_VOCABULARY
    # Only existing keys are reassigned, so the dict keeps its size and it
    # is safe to update it while iterating.
    for key, count in bigram_dict.items():
        first_word = key.split()[0]
        first_word_count = unigram_dict.get(first_word, 0)
        bigram_dict[key] = (count + 1) / (first_word_count + vocabulary_size)
    return bigram_dict

def ultimate_tweet(generated_tweet):
    """Render a generated word list as a printable, timestamped tweet.

    The words are joined with single spaces. ``"<b>"`` is replaced by a
    ``"Trump posted on D-M-YYYY at HH:MM :"`` header built from the current
    local time, and ``"<end>"`` is removed (the space that preceded it
    stays in the string).

    Args:
        generated_tweet: Sequence of words, e.g. the output of
            ``generate_tweet``.

    Returns:
        The formatted tweet string.
    """
    # Read the clock once so the date and the time in the header always
    # describe the same instant.
    now = datetime.datetime.now()
    header = "Trump posted on {}-{}-{} at {} :".format(
        now.day, now.month, now.year, now.strftime("%H:%M")
    )
    str_tweet = ' '.join(map(str, generated_tweet))
    return str_tweet.replace("<b>", header).replace("<end>", "")

# Clean the raw tweets and tag the start and end of each one.
array_tweets = add_begin_end(clean_tweets(tweet_df))

# Count single words and adjacent word pairs, then smooth the pair counts
# into probabilities.
unigram_dict = unigram_generator(array_tweets)
bigram_dict = estimate_probability(bigrams_generator(array_tweets), unigram_dict)

# Generate one tweet and print it with its header.
generated_tweet = generate_tweet(bigram_dict)
print(ultimate_tweet(generated_tweet))
