In [12]:
import pandas as pd
import openai
import random
import time

openai.api_key = "API_KEY"

In [82]:
def text_cleaning(df_2):
    #remove URL
    df_2['tweet_proc'] = df_2['Tweet'].str.replace(r'http(\S)+', r'')
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'http ...', r'')
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'http', r'')
    df_2[df_2['tweet_proc'].str.contains(r'http')]

    # remove RT, @
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
    df_2[df_2['tweet_proc'].str.contains(r'RT[ ]?@')]
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'@[\S]+',r'')

    #remove non-ascii words and characters
    df_2['tweet_proc'] = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df_2['tweet_proc']]
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'_[\S]?',r'')

    #remove &, < and >
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'&amp;?',r'and')
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'&lt;',r'<')
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'&gt;',r'>')

    # remove extra space
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'[ ]{2, }',r' ')

    # insert space between punctuation marks
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
    df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

    # lower case and strip white spaces at both ends
    df_2['tweet_proc'] = df_2['tweet_proc'].str.lower()
    df_2['tweet_proc'] = df_2['tweet_proc'].str.strip()
    #We will calculate the length of each tweet and only keep unique tweets that are 3 words or longer.

    df_2['tweet_proc_length'] = [len(text.split(' ')) for text in df_2['tweet_proc']]
    df_2['tweet_proc_length'].value_counts()

    df_2 = df_2[df_2['tweet_proc_length']>3]
    df_2 = df_2.drop_duplicates(subset=['tweet_proc'])
    tweets_list = df_2['tweet_proc']
    return tweets_list

# Extract informations and tweets of each Account to use in prompts 
def information(df, username):
    df = df[df['conversationId'] == df['id']]
    df = df.reset_index(drop=True)
    tweets = text_cleaning(df)
    tweets_shuffled = tweets.sample(frac=1)
    tweets_shuffled = tweets_shuffled.reset_index(drop=True)

    df_accounts = pd.read_csv("accounts.csv")
    df_accounts = df_accounts[df_accounts['username'] == username]
    df_accounts = df_accounts.reset_index(drop=True)
    name = df_accounts['display name'][0]
    followers = df_accounts['followers count'][0]
    friends = df_accounts['friends count'][0]
    page_age = df_accounts['user created date'][0][0:4]
    verified = df_accounts['is verified'][0]
    return tweets_shuffled, name, followers, friends, page_age, verified

# Function to send a message to the OpenAI chatbot model and return its response
def send_message(message_log):
    # Use OpenAI's ChatCompletion API to get the chatbot's response
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # The name of the OpenAI chatbot model to use
        messages=message_log,   # The conversation history up to this point, as a list of dictionaries
        max_tokens=3800,        # The maximum number of tokens (words or subwords) in the generated response
        stop=None,              # The stopping sequence for the generated response, if any (not used here)
        temperature=0.7,        # The "creativity" of the generated response (higher temperature = more creative)
    )

    # Find the first response from the chatbot that has text in it (some responses may not have text)
    for choice in response.choices:
        if "text" in choice:
            return choice.text

    # If no response with text is found, return the first response's content (which may be empty)
    return response.choices[0].message.content

def first_prompt(tweets, name, followers, friends, page_age, verified):
    len_tweets = len(tweets)
    prompts_list = []
    for i in range(20):
        random_list = []
        # Set a length of the list to 10
        for i in range(0, 4):
            # any random numbers from 0 to 1000
            random_list.append(random.randint(0, len_tweets-1))
        prompt = """write exactly two paragraph description for username {}: {} is a {} verified user in tweeter since {} and she has {} followers and {} friends,
she posts following tweets in her timeline:
t1 = {}
t2 = {}
t3 = {}
t4 = {}
""".format(name, name, verified, page_age, followers, friends, tweets[random_list[0]], tweets[random_list[1]], tweets[random_list[2]], tweets[random_list[3]])
        prompts_list.append(prompt)
    return prompts_list

def second_prompt(paragraph_list, mixed_numbers):
    len_paragraph = len(paragraph_list)
    prompts_list = []
    for i in range(mixed_numbers):
        random_list = []
        # Set a length of the list to 10
        for i in range(0, 2):
            # any random numbers from 0 to 1000
            random_list.append(random.randint(0, len_paragraph-1))
        prompt = """rewrite it more informative but summarized in human words without extra words:
s1 = {}
s2 = {}
""".format(paragraph_list[random_list[0]], paragraph_list[random_list[1]])
        prompts_list.append(prompt)
    return prompts_list

def paragraphs(prompts_list):
    # Initialize the conversation history with a message from the chatbot
    message_log = [
        {"role": "system", "content": "You are a helpful assistant."}
    ]


    summaries_list = []
    # Set a flag to keep track how many summaries we got
    summaries = 0

    # Start a loop that runs until prompts finished
    for prompts in prompts_list:
        message_log = [
        {"role": "system", "content": "You are a helpful assistant."}
    ]
        # if the len of the prompt is not higher than 1100 character we can make the summary because of tokens limitation
        if len(prompts) > 1100:
            print('pass')
            time.sleep(10)
            pass
        else :
            user_input = prompts
            print(len(prompts))
            message_log.append({"role": "user", "content": user_input})

            # Add a message from the chatbot to the conversation history
            message_log.append({"role": "assistant", "content": "You are a helpful assistant."})

            # Send the conversation history to the chatbot and get its response
            response = send_message(message_log)

            # Add the chatbot's response to the conversation history and print it to the console
            message_log.append({"role": "assistant", "content": response})
            summaries_list.append(response)
            summaries += 1
            # Adding sleep timer to prevend request limitation of API
            time.sleep(10)
    return summaries_list

# Ylecun

In [19]:
df = pd.read_csv("ylecun_owner_all.csv")
tweets, name, followers, friends, page_age, verified = information(df, 'ylecun')

  df_2['tweet_proc'] = df_2['Tweet'].str.replace(r'http(\S)+', r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'http ...', r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'@[\S]+',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'_[\S]?',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'&amp;?',r'and')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'[ ]{2, }',r' ')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')


First we create 20 prompts by mixing 4 random tweets to make initialized summaries and Improve in future

In [27]:
prompts_list = first_prompt(tweets, name, followers, friends, page_age, verified)

In [28]:
prompts_list[12]

'write exactly two paragraph description for username Yann LeCun: Yann LeCun is a False verified user in tweeter since 2009 and she has 434596 followers and 601 friends,\nshe posts following tweets in her timeline:\nt1 = scientific debates on social media are like a human form of bidirectional rlhf .\n the person making the post gets feedback ( good and bad ).\n the commenters also get feedback , mostly when they are clueless or wrong .\nt2 = convnets are a decent model of how the ventral pathway of the human visual cortex works .\n but llms don \' t seem to be a good model of how humans process language .\n there longer - term prediction taking place in the brain .\n awesome work by the brain - ai group at fair - paris .\nt3 = if using lisp as a front - end language to a deep learning system is " neuro - symbolic " then , i \' ve doing neuro - symbolic stuff since 1987\nt4 = from the horse \' s mouth .\n'

In [29]:
len(prompts_list[15])

1101

getting Initialized summaries

In [30]:
summaries_list = paragraphs(prompts_list)

1057
826
840
pass
796
pass
887
772
pass
942
690
pass
903
693
1099
pass
1038
1065
984
pass


Seperating Paragraph 1 and 2 to have 2 seperate paragraphs and handling OpenAI tokens limitation

In [31]:
paragraph_1_list = []
paragraph_2_list = []
for text in summaries_list:
    x = text.split("\n\n")
    paragraph_1 = x[0]
    paragraph_2 = x[1]
    paragraph_1_list.append(paragraph_1)
    paragraph_2_list.append(paragraph_2)

creating 10 prompts for each paragraph by mixing 2 random summarized texts we got from last part 
by doing this we got more detailed and robust summaries 

In [32]:
paragraph1 = second_prompt(paragraph_1_list, 10)
summaries_list_p1 = paragraphs(paragraph1)

pass
1030
940
980
pass
1011
1011
952
pass
963


In [33]:
paragraph2 = second_prompt(paragraph_2_list, 10)
summaries_list_p2 = paragraphs(paragraph2)

pass
pass
1056
1038
1047
1038
pass
1047
1001
996


now We get the smallest summaries for each paragraph, smallest one has less miss informations and AI irrelevant texts 

In [34]:
def p_min(paragraph_list):
    p = []
    min_len = 10000
    for s in paragraph_list:
        if len(s) < min_len:
            min_len = len(s)
            p = s 
        else:
            continue
    return p

In [35]:
p1 = p_min(summaries_list_p1)
p2 = p_min(summaries_list_p2)

In [36]:
promt = [""" 
write it as human two paragraph writing without any extra words:
{}
{}
""".format(p1, p2)]

In [38]:
final = paragraphs(promt)

770


In [39]:
final

["Yann LeCun is a renowned computer scientist and the Director of AI Research at Facebook. He is an expert in the field of artificial intelligence and regularly shares his knowledge with his 434,596 Twitter followers and 601 friends. Yann's tweets cover a range of topics related to AI, including machine learning and deep learning, and he keeps his followers up-to-date with the latest trends and developments in the field.\n\nYann is known for his candid opinions and has a large following on Twitter. He regularly hosts Q&A sessions and invites questions on AI and creativity. Yann's keynote speeches at conferences are highly regarded, and his technical insights and humorous observations on Twitter are closely followed by others in the AI community. Overall, Yann LeCun is a respected figure in the field of AI, and his contributions to the community are highly valued."]

# Taylor

In [40]:
df = pd.read_csv("taylor_owner_all.csv")
tweets, name, followers, friends, page_age, verified = information(df, 'TaylorLorenz')
prompts_list = first_prompt(tweets, name, followers, friends, page_age, verified)

  df_2['tweet_proc'] = df_2['Tweet'].str.replace(r'http(\S)+', r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'http ...', r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'@[\S]+',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'_[\S]?',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'&amp;?',r'and')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'[ ]{2, }',r' ')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')


In [41]:
summaries_list = paragraphs(prompts_list)

655
919
896
688
854
666
865
964
673
415
909
579
684
880
821
693
813
572
628
545


In [42]:
paragraph_1_list = []
paragraph_2_list = []
for text in summaries_list:
    x = text.split("\n\n")
    paragraph_1 = x[0]
    paragraph_2 = x[1]
    paragraph_1_list.append(paragraph_1)
    paragraph_2_list.append(paragraph_2)


In [45]:
paragraph1 = second_prompt(paragraph_1_list, 10)
summaries_list_p1 = paragraphs(paragraph1)
paragraph2 = second_prompt(paragraph_2_list, 10)
summaries_list_p2 = paragraphs(paragraph2)

957
735
943
918
685
1001
816
816
882
pass
1052
1053
pass
pass
pass
pass
pass
1026
1030
pass


In [46]:
p1 = p_min(summaries_list_p1)
p2 = p_min(summaries_list_p2)

In [50]:
p1

'Taylor Lorenz is a journalist who is well known for her writing on social media trends and the impact of technology on culture. She has a large Twitter following of over 350,000 users and has been a verified user since 2010. Lorenz currently works as a staff writer at The New York Times.'

In [51]:
p2

"Lorenz's tweets reflect her unique perspective and diverse interests. She's not afraid to discuss controversial topics or express her opinions on important issues. Her tweets are engaging and thought-provoking, whether she's talking about fashion or calling out unethical behavior in the media. Her followers appreciate her honesty and authenticity, and she's gained a reputation as a trusted voice in social media and journalism. Both sentences are the same."

In [58]:
promt = [""" 
write it as human exactly two paragraphs /n/n writing without any extra words:
{}
{}
""".format(p1, p2)]

In [61]:
final_taylor = paragraphs(promt)

830


In [62]:
final_taylor

["Taylor Lorenz is a journalist whose writing focuses on social media trends and the impact of technology on culture. She has a verified Twitter account with over 350,000 followers, and she has been active on the platform since 2010. Currently, Lorenz works as a staff writer for The New York Times.\n\nLorenz's tweets are engaging and cover a wide range of topics, from fashion to ethics in media. She is not afraid to express her opinions on controversial issues, and her followers appreciate her authenticity and honesty. Her unique perspective and thought-provoking tweets have earned her a reputation as a trusted voice in both social media and journalism."]

# Cathie

In [67]:
df = pd.read_csv("cathiedwood_owner_all.csv")
tweets, name, followers, friends, page_age, verified = information(df, 'CathieDWood')
prompts_list = first_prompt(tweets, name, followers, friends, page_age, verified)

  df_2['tweet_proc'] = df_2['Tweet'].str.replace(r'http(\S)+', r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'http ...', r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'@[\S]+',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'_[\S]?',r'')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'&amp;?',r'and')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'[ ]{2, }',r' ')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
  df_2['tweet_proc'] = df_2['tweet_proc'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')


In [68]:
summaries_list = paragraphs(prompts_list)

1070
1135
pass
1079
pass
pass
pass
1113
1149
pass
pass
pass
947
pass
pass
pass
pass
996
pass
pass


In [69]:
paragraph_1_list = []
paragraph_2_list = []
for text in summaries_list:
    x = text.split("\n\n")
    paragraph_1 = x[0]
    paragraph_2 = x[1]
    paragraph_1_list.append(paragraph_1)
    paragraph_2_list.append(paragraph_2)


In [70]:
paragraph1 = second_prompt(paragraph_1_list, 10)
summaries_list_p1 = paragraphs(paragraph1)


959
1041
1041
pass
1019
pass
1027
1019
1027
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass


In [83]:
paragraph2 = second_prompt(paragraph_2_list, 10)
summaries_list_p2 = paragraphs(paragraph2)

1237
1127
1288
pass
pass
pass
pass
pass
pass
1127


In [84]:
p1 = p_min(summaries_list_p1)
p2 = p_min(summaries_list_p2)

In [85]:
p1

'Cathie Wood is a famous finance and investment expert. She created a company called ARK Invest that invests in companies that lead in technology innovation. She has a lot of followers on Twitter, where she shares her investment strategies and market insights. People look up to her for guidance on how to invest in the financial market.'

In [86]:
p2

'Cathie Wood shares her knowledge on finance and technology trends via Twitter. She focuses on emerging technologies with the potential to transform industries, such as digital TV advertising. She also discusses the impact of innovation on economic theories like Keynesian economics. Her large Twitter following makes her a valuable source of information for investors and industry professionals.'

In [87]:
promt = [""" 
write it as human exactly two paragraphs /n/n writing without any extra words:
{}
{}
""".format(p1, p2)]

In [116]:
final_cathie = paragraphs(promt)

814


In [117]:
final_cathie

['Cathie Wood is a well-known finance and investment expert. She is the founder of ARK Invest, a company that invests in innovative technology companies. Cathie has a significant following on Twitter, where she shares her investment strategies and market insights. Her followers look up to her for advice on how to invest in the financial market, especially in emerging technologies that can transform industries.\n\nThrough her Twitter account, Cathie shares her knowledge on finance and technology trends. She concentrates on emerging technologies that have the potential to revolutionize industries, like digital TV advertising. Additionally, she discusses the impact of innovation on economic theories such as Keynesian economics. Due to her large Twitter following, Cathie is a valuable source of information for investors and industry professionals alike.']

In [107]:
final_cathie = final_cathie[0]
final = final[0]
final_taylor = final_taylor[0]

In [122]:
d = {'Accounts': ['ylecun', 'TaylorLorenz', 'CathieDWood'], 'Description': [final, final_taylor, final_cathie]}
df_descriptions = pd.DataFrame(data=d)

In [123]:
df_descriptions.to_csv("Descriptions.csv", index=False)

In [135]:
print(df_descriptions['Description'][0])

Yann LeCun is a renowned computer scientist and the Director of AI Research at Facebook. He is an expert in the field of artificial intelligence and regularly shares his knowledge with his 434,596 Twitter followers and 601 friends. Yann's tweets cover a range of topics related to AI, including machine learning and deep learning, and he keeps his followers up-to-date with the latest trends and developments in the field.

Yann is known for his candid opinions and has a large following on Twitter. He regularly hosts Q&A sessions and invites questions on AI and creativity. Yann's keynote speeches at conferences are highly regarded, and his technical insights and humorous observations on Twitter are closely followed by others in the AI community. Overall, Yann LeCun is a respected figure in the field of AI, and his contributions to the community are highly valued.


In [132]:
print(df_descriptions['Description'][1])

Taylor Lorenz is a journalist whose writing focuses on social media trends and the impact of technology on culture. She has a verified Twitter account with over 350,000 followers, and she has been active on the platform since 2010. Currently, Lorenz works as a staff writer for The New York Times.

Lorenz's tweets are engaging and cover a wide range of topics, from fashion to ethics in media. She is not afraid to express her opinions on controversial issues, and her followers appreciate her authenticity and honesty. Her unique perspective and thought-provoking tweets have earned her a reputation as a trusted voice in both social media and journalism.


In [133]:
print(df_descriptions['Description'][2])

Cathie Wood is a well-known finance and investment expert. She is the founder of ARK Invest, a company that invests in innovative technology companies. Cathie has a significant following on Twitter, where she shares her investment strategies and market insights. Her followers look up to her for advice on how to invest in the financial market, especially in emerging technologies that can transform industries.

Through her Twitter account, Cathie shares her knowledge on finance and technology trends. She concentrates on emerging technologies that have the potential to revolutionize industries, like digital TV advertising. Additionally, she discusses the impact of innovation on economic theories such as Keynesian economics. Due to her large Twitter following, Cathie is a valuable source of information for investors and industry professionals alike.
