In [None]:
from loaders.emailTextLoader import get_emails
from loaders.ytbChannelLoader import get_videos

In [None]:
# email_corpus = get_emails()
# video_corpus = get_videos()

# Read the corpora from text files on disk rather than calling the loaders
# (commented out above). The encoding is pinned to UTF-8 because open()'s
# default depends on the platform locale, which can garble or reject
# non-ASCII characters (e.g. emojis) on some systems.
# NOTE(review): assumes the files under test/ were saved as UTF-8 - confirm.
with open('test/day_mail_corpus.txt', 'r', encoding='utf-8') as f:
    email_corpus = f.read()

with open('test/day_video_corpus.txt', 'r', encoding='utf-8') as f:
    video_corpus = f.read()

In [None]:
# Number of characters read into the video corpus (quick sanity check of the load).
len(video_corpus)

In [None]:
# Number of characters read into the e-mail corpus (quick sanity check of the load).
len(email_corpus)

In [None]:
# with open('test/day_mail_corpus.txt', 'w') as f:
#     f.write(email_corpus)

# with open('test/day_video_corpus.txt', 'w') as f:
#     f.write(video_corpus)

In [None]:
import re

# Built once at definition time so repeated calls reuse the same pattern.
# Each match is one or more emoji code points, every one optionally followed
# by the "glue" characters that emoji sequences use: U+FE0F (variation
# selector-16) and U+200D (zero-width joiner). Those two are only consumed
# when they directly follow an emoji, so ZWJ usage in ordinary text survives.
_EMOJI_PATTERN = re.compile(
    "(?:"
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs (incl. skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F004-\U0001F0CF"  # Mahjong tiles (from U+1F004), domino tiles, playing cards
    "\U0001F170-\U0001F251"  # Enclosed alphanumerics, regional-indicator (flag) letters, enclosed ideographs
    "\u2600-\u26FF"          # Miscellaneous Symbols (sun, warning sign, high voltage, ...)
    "\u2700-\u27BF"          # Dingbats (sparkles, check mark, heavy heart, ...)
    "]"
    "[\uFE0F\u200D]*"
    ")+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji characters stripped out.

    Besides the supplementary-plane pictograph ranges, this also removes the
    Miscellaneous Symbols and Dingbats blocks, and the variation selector /
    zero-width joiner that trail an emoji, so multi-code-point sequences do
    not leave invisible characters behind.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched characters. Surrounding whitespace
        is left untouched, so removing an emoji between two spaces leaves
        both spaces in place.
    """
    return _EMOJI_PATTERN.sub('', text)


In [None]:
# Length of the e-mail corpus after emoji removal; compare with len(email_corpus) to see how much was stripped.
len(remove_emojis(email_corpus))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Split the e-mail corpus into rough "sentences" on full stops. split('.')
# leaves empty / whitespace-only fragments behind (after a trailing period,
# "...", etc.); those carry no terms, so they are dropped before vectorizing.
corpus = [fragment for fragment in email_corpus.split('.') if fragment.strip()]

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Apply K-Means clustering.
# KMeans raises if asked for more clusters than there are samples, so the
# requested count (75, tune as needed) is capped at the number of fragments.
# random_state is fixed so that cluster assignments are reproducible.
k = min(75, len(corpus))
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

# Project to 2-D with PCA purely for plotting. The TF-IDF matrix is densified
# with .toarray() before being handed to PCA, which is fine for a small
# corpus but memory-hungry for a large one.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Scatter plot of the fragments, coloured by cluster id
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis')
plt.title(f'K-Means Clustering (k={k})')
plt.show()

# Map each cluster label to the indices of the fragments assigned to it
cluster_representatives = {}
for i, label in enumerate(clusters):
    cluster_representatives.setdefault(label, []).append(i)

# Map each cluster label to the original text of its fragments
cluster_texts = {
    label: [corpus[i] for i in data_point_indices]
    for label, data_point_indices in cluster_representatives.items()
}


In [None]:
# Dump every cluster: a header line with the label, then the list of its fragments.
for label, text in cluster_texts.items():
    print(f'Cluster {label}:', text, sep='\n')

In [None]:
import nltk
from nltk.stem import PorterStemmer

def stemming(corpus):
    """Return ``corpus`` with every token replaced by its Porter stem.

    The text is tokenized with ``nltk.word_tokenize``, each token is stemmed,
    and the stems are re-joined with single spaces. The length of the
    resulting string is printed as a side effect.

    Args:
        corpus: The text to stem, as a single string.

    Returns:
        The space-joined stemmed tokens.
    """
    # Make sure the 'punkt' tokenizer data is available before tokenizing.
    nltk.download('punkt')

    porter = PorterStemmer()
    stemmed_text = ' '.join(
        porter.stem(token) for token in nltk.word_tokenize(corpus)
    )

    print(len(stemmed_text))
    return stemmed_text


In [None]:
import spacy

def lemma(corpus):
    """Return ``corpus`` with every token replaced by its spaCy lemma.

    Loads the ``en_core_web_sm`` pipeline on each call, runs it over the
    text, and joins the ``lemma_`` of every token with single spaces. The
    length of the resulting string is printed as a side effect.

    Args:
        corpus: The text to lemmatize, as a single string.

    Returns:
        The space-joined lemmas.
    """
    pipeline = spacy.load("en_core_web_sm")

    lemmas = [token.lemma_ for token in pipeline(corpus)]
    lemmatized_text = ' '.join(lemmas)

    print(len(lemmatized_text))
    return lemmatized_text

In [None]:
import nltk
from nltk.corpus import stopwords
import string

def remove_stop(corpus):
    """Return ``corpus`` with punctuation and English stop words removed.

    The text is tokenized with ``nltk.word_tokenize``, ASCII punctuation
    (``string.punctuation``) is stripped from every token, and tokens that
    are empty afterwards or that appear (case-insensitively) in NLTK's
    English stop-word list are dropped. The survivors are joined with single
    spaces. The length of the result is printed as a side effect.

    Args:
        corpus: The text to filter, as a single string.

    Returns:
        The space-joined remaining tokens.
    """
    # word_tokenize needs the 'punkt' data; fetch it here as well (as
    # stemming() does) so this function does not depend on another cell
    # having been run first.
    nltk.download('punkt')
    nltk.download('stopwords')

    stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)

    filtered_words = []
    for word in nltk.word_tokenize(corpus):
        stripped = word.translate(table)
        # Tokens that were pure punctuation are now '' - skip them, otherwise
        # the join below produces runs of consecutive spaces.
        if stripped and stripped.lower() not in stop_words:
            filtered_words.append(stripped)

    filtered_text = ' '.join(filtered_words)

    print(len(filtered_text))

    return filtered_text

In [None]:
# Run each normalisation helper on the e-mail corpus; every helper prints the
# length of its output, which allows a quick comparison of how much each shrinks the text.
# Only the return value of the last call (remove_stop) is shown as the cell output.
stemming(email_corpus)
lemma(email_corpus)
remove_stop(email_corpus)

In [None]:
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np

# Tokenize the text into sentences
sentences = sent_tokenize(email_corpus)

# Calculate TF-IDF scores for words in sentences
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Score each sentence by the sum of its TF-IDF weights, flattened to a plain
# 1-D array so it can be sorted directly.
sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

# Select the top N sentences (highest score first). N is capped at the number
# of sentences available so a short corpus never yields duplicates, and a
# stable sort keeps the earlier sentence first when scores tie.
num_sentences = 10  # Adjust this based on your desired summary length
top_n = min(num_sentences, len(sentences))
ranked_indices = np.argsort(-sentence_scores, kind='stable')[:top_n]
selected_sentences = [sentences[index] for index in ranked_indices]

# Reconstruct the summary
summary = ' '.join(selected_sentences)
print(len(summary))

In [None]:
!pip install bardapi python-dotenv transformers sentence-transformers

In [None]:
from typing import Any, List, Mapping, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from bardapi import BardCookies
from dotenv import load_dotenv

class BardRAPI(LLM):
    """LangChain ``LLM`` adapter that answers prompts through ``bardapi``.

    Every call creates a fresh ``BardCookies`` client, sends the prompt, and
    returns the ``'content'`` field of the answer.
    """

    @property
    def _llm_type(self) -> str:
        """Type label reported for this LLM implementation."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` to Bard and return the text of its answer.

        Args:
            prompt: The full prompt text to submit.
            stop: Optional stop sequences. The answer is cut at the earliest
                occurrence of any of them.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Extra keyword arguments forwarded by the caller;
                accepted so they do not raise a ``TypeError`` and otherwise
                ignored.

        Returns:
            The ``'content'`` value of the answer, truncated at a stop
            sequence when one is found.
        """
        load_dotenv()
        # Echo the outgoing prompt so each request is visible in the cell output.
        print(prompt)
        # NOTE(review): token_from_browser=True presumably pulls the session
        # token from a locally installed browser - confirm against bardapi docs.
        bard = BardCookies(token_from_browser=True)
        answer = bard.get_answer(prompt)['content']

        if stop:
            # Positions of every stop sequence that actually occurs in the answer.
            cut_points = [
                position
                for position in (answer.find(token) for token in stop if token)
                if position != -1
            ]
            if cut_points:
                answer = answer[:min(cut_points)]

        return answer

In [None]:
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
def summarize_text(corpora):
    """Summarize ``corpora`` with a map-reduce LangChain chain backed by ``BardRAPI``.

    The text is split with ``RecursiveCharacterTextSplitter(chunk_size=2000)``,
    each chunk is wrapped in a ``Document``, and the documents are run through
    a ``map_reduce`` summarize chain. The prompt below is used for the map
    step only; no combine prompt is passed, so the reduce step uses whatever
    default ``load_summarize_chain`` supplies. Both prompt templates are
    printed before the chain runs.

    Args:
        corpora: The text to summarize, as a single string.

    Returns:
        The value returned by ``chain.run`` for the chunked documents.
    """
    prompt_template = """You are given a text about various news happening in the space of artifical intelligence, machine learning or data science.
    Your job is to summarize the most important news from the text, with emphasis on news around large language models, and the tools that are used to handle them.
    Don't worry about the length of the summary, just make sure it is coherent and covers the most important points, and also does not skimp on details.


    {text}

    CONCISE SUMMARY: """

    # Chunk the input and wrap every chunk in a Document for the chain.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
    docs = [Document(page_content=chunk) for chunk in splitter.split_text(corpora)]

    map_prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
    chain = load_summarize_chain(
        BardRAPI(),
        chain_type="map_reduce",
        map_prompt=map_prompt,
        verbose=True,
    )

    # Show the map prompt first, then the combine (reduce) prompt.
    for stage in (chain.llm_chain, chain.combine_document_chain.llm_chain):
        print(stage.prompt.template)

    return chain.run(docs)

In [None]:
# Strip punctuation and stop words from the e-mail corpus first, then summarize the reduced text.
response = summarize_text(remove_stop(email_corpus))

In [None]:
# Show the generated summary.
print(response)

In [None]:
from bardapi import BardCookies
from datetime import datetime

# Create a Bard client.
# NOTE(review): token_from_browser=True presumably reads the session token from a local browser - confirm against bardapi docs.
bard = BardCookies(token_from_browser=True)
# Request a spoken version of the summary; the result is indexed with ['audio'] when it is written to disk later on.
audio = bard.speech(response)

In [None]:

from pathlib import Path

from IPython.display import Audio

# Date stamp used in the output file names (and by the git commit cell below).
today_str = datetime.today().strftime("%Y-%m-%d")

audio_path = Path("summaries/audio") / f"summary_{today_str}.ogg"
text_path = Path("summaries/text") / f"summary_{today_str}.txt"

# Create the output folders on first use so the writes below cannot fail
# with FileNotFoundError in a fresh checkout.
for out_dir in (audio_path.parent, text_path.parent):
    out_dir.mkdir(parents=True, exist_ok=True)

with open(audio_path, "wb") as f:
    f.write(bytes(audio['audio']))

# UTF-8 is pinned so non-ASCII characters in the summary are written the same
# way regardless of the platform's default encoding.
with open(text_path, "w", encoding="utf-8") as f:
    f.write(response)

print(response)

# Play back the file that was just written. It lives under summaries/audio,
# not summaries/text.
Audio(str(audio_path))


In [None]:
import subprocess

# Stage, commit and push today's summary files.
# Commands are passed as argument lists (no shell), so the commit message
# reaches git as a single argument on every platform instead of relying on
# the shell's handling of single quotes.
# Return codes are deliberately not checked: "git commit" exits non-zero when
# there is nothing new to commit, and that should not abort the cell.
subprocess.run(["git", "add", "summaries/text"])
subprocess.run(["git", "add", "summaries/audio"])

subprocess.run(["git", "commit", "-m", f"Summary for {today_str}"])
subprocess.run(["git", "push", "origin", "master"])