## LinkedIn Setup

In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import os
import PyPDF2
from google.cloud import storage

# Set the environment variable to point to your service account key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/local-key.json"

# Function to extract text from the LinkedIn resume PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The text of all pages that produced any text, in page order and
        separated by a newline. An empty string if no page produced text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the `with` block so the file handle is
        # still open while pages are read. `or ""` tolerates a page whose
        # extraction yields nothing.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next; pages without text are skipped.
    return "\n".join(chunk for chunk in page_texts if chunk)

# Extract text from your LinkedIn resume
# (hard-coded absolute path: the PDF has to exist at this location before the cell runs)
pdf_path = '/content/resume.pdf'
text = extract_text_from_pdf(pdf_path)

# Store extracted text in Cloud Storage
# storage.Client() is given no explicit credentials; presumably it relies on the
# GOOGLE_APPLICATION_CREDENTIALS variable set earlier in this cell -- confirm.
client = storage.Client()
bucket = client.get_bucket('char-bot')
# The object name is fixed, so each run overwrites the previous upload.
blob = bucket.blob('linkedin_resume_text.txt')
blob.upload_from_string(text)

print("Resume text stored in Cloud Storage")


Resume text stored in Cloud Storage


## Google Docs setup

In [None]:
!pip install --upgrade google-auth-oauthlib

In [None]:
from google_auth_oauthlib.flow import InstalledAppFlow
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import os
import re
import pandas as pd
from google.cloud import storage

# Load credentials and create a Google Docs & Drive service
def google_services():
    """Build Google Docs (v1) and Google Drive (v3) API clients.

    Credentials are loaded from '/content/token.json' with the read-only
    Docs and Drive scopes when that file exists. When it does not, a message
    is printed and both clients are built with ``credentials=None``.

    Returns:
        A ``(docs_service, drive_service)`` tuple.
    """
    token_file = '/content/token.json'
    read_only_scopes = [
        'https://www.googleapis.com/auth/documents.readonly',
        'https://www.googleapis.com/auth/drive.readonly',
    ]

    if not os.path.exists(token_file):
        # No stored OAuth token: warn, then continue without explicit credentials.
        print("Token.json missing. Please authenticate.")
        credentials = None
    else:
        credentials = Credentials.from_authorized_user_file(token_file, read_only_scopes)

    # Docs is built before Drive, matching the order of the returned tuple.
    return (
        build('docs', 'v1', credentials=credentials),
        build('drive', 'v3', credentials=credentials),
    )


# Get list of all Google Docs
def list_google_docs(drive_service):
    """List every Google Docs document visible to the Drive service.

    The Drive API returns results one page at a time, so this follows
    ``nextPageToken`` until the API stops returning one.

    Args:
        drive_service: Drive v3 client exposing ``files().list(...).execute()``.

    Returns:
        A list of ``{'id': ..., 'name': ...}`` dicts, one per document.
        An empty list if the API raises ``HttpError`` (the error is printed).
    """
    docs = []
    page_token = None
    try:
        while True:
            results = drive_service.files().list(
                q="mimeType='application/vnd.google-apps.document'",
                fields="nextPageToken, files(id, name)",
                pageToken=page_token).execute()
            docs.extend(results.get('files', []))

            # A missing or empty token means this was the last page.
            page_token = results.get('nextPageToken')
            if not page_token:
                return docs
    except HttpError as error:
        print(f"An error occurred: {error}")
        return []


# Extract content from a Google Doc by document ID
def _read_structural_elements(elements):
    """Concatenate the text runs found in a list of Docs structural elements.

    Paragraph text runs are appended in order. Tables are walked row by row
    and cell by cell, recursing into each cell's own element list, so text
    inside tables is included. Other element kinds are ignored.
    """
    pieces = []
    for element in elements or []:
        if 'paragraph' in element:
            for paragraph_element in element['paragraph'].get('elements', []):
                text_run = paragraph_element.get('textRun')
                if text_run:
                    pieces.append(text_run.get('content', ''))
        elif 'table' in element:
            for row in element['table'].get('tableRows', []):
                for cell in row.get('tableCells', []):
                    pieces.append(_read_structural_elements(cell.get('content')))
    return ''.join(pieces)


def extract_google_doc_content(docs_service, doc_id):
    """Fetch a Google Doc and return its text content.

    Args:
        docs_service: Docs v1 client exposing ``documents().get(...).execute()``.
        doc_id: ID of the document to fetch.

    Returns:
        The concatenated text of the document's paragraphs and table cells
        (an empty string when the document has no body content), or ``None``
        if the API raises ``HttpError`` (the error is printed).
    """
    try:
        document = docs_service.documents().get(documentId=doc_id).execute()
    except HttpError as error:
        print(f"An error occurred: {error}")
        return None

    # Tolerate a response without 'body' / 'content' instead of raising.
    body = document.get('body') or {}
    return _read_structural_elements(body.get('content'))


# Clean and preprocess text (removes non-alphanumeric characters and excessive spaces)
def clean_text(text):
    """Strip punctuation and normalise whitespace in ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character that is neither a word character nor
        whitespace removed, each run of whitespace collapsed to one space,
        and leading/trailing whitespace stripped.
    """
    # Punctuation is removed first: removing it after collapsing whitespace
    # would leave a double space wherever a symbol stood alone ("a - b").
    text = re.sub(r'[^\w\s]', '', text)  # Remove all special characters
    text = re.sub(r'\s+', ' ', text)  # Replace whitespace runs with a single space
    return text.strip()


# Structuring the data
def structure_data(doc_content, doc_id, doc_name):
    """Wrap one document in a single-row DataFrame.

    Args:
        doc_content: Raw document text; it is passed through ``clean_text``.
        doc_id: Value stored in the ``Document_ID`` column.
        doc_name: Value stored in both ``Document_name`` and ``Source_Tag``.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Document_ID``,
        ``Document_name``, ``Content`` and ``Source_Tag``.
    """
    row = {
        'Document_ID': doc_id,
        'Document_name': doc_name,
        'Content': clean_text(doc_content),
        'Source_Tag': doc_name,
    }
    # Each value becomes a one-element column, giving exactly one row.
    return pd.DataFrame({column: [value] for column, value in row.items()})


# Store cleaned text in Cloud Storage
def store_in_cloud_storage(doc_name, text, bucket_name='char-bot'):
    """Upload ``text`` to Cloud Storage as the object ``<doc_name>.txt``.

    Args:
        doc_name: Base name of the object; ".txt" is appended.
        text: String content to upload.
        bucket_name: Name of the destination bucket.

    A confirmation line is printed after the upload.
    """
    destination = storage.Client().get_bucket(bucket_name)
    destination.blob(f"{doc_name}.txt").upload_from_string(text)
    print(f"Stored {doc_name}.txt in Cloud Storage")


# Main function to execute the process
def main():
    """Copy the cleaned text of every listed Google Doc into Cloud Storage.

    Builds the Docs and Drive clients, lists the documents, and for each one
    extracts its content, structures it with ``structure_data`` and uploads
    the resulting ``Content`` value under the document's name. Documents whose
    content comes back as ``None`` are skipped. Prints a message and returns
    early when no documents are listed.
    """
    docs_service, drive_service = google_services()

    docs = list_google_docs(drive_service)
    if not docs:
        print("No documents found.")
        return

    for doc in docs:
        doc_id, doc_name = doc['id'], doc['name']

        doc_content = extract_google_doc_content(docs_service, doc_id)
        if doc_content is not None:
            # structure_data cleans the text; only its 'Content' cell is uploaded.
            structured_data = structure_data(doc_content, doc_id, doc_name)
            store_in_cloud_storage(doc_name, structured_data['Content'].iloc[0])

if __name__ == '__main__':
    main()


Stored Home Depot.txt in Cloud Storage
Stored My Girl.txt in Cloud Storage
Stored Behavioural-Teja.txt in Cloud Storage
Stored Note.txt in Cloud Storage
Stored Weekend Projects.txt in Cloud Storage
Stored Microsoft.txt in Cloud Storage
Stored Business.txt in Cloud Storage
Stored GSU.txt in Cloud Storage
Stored Courses.txt in Cloud Storage
Stored Full Time Job.txt in Cloud Storage
Stored Useful.txt in Cloud Storage
Stored Time.txt in Cloud Storage
Stored Work TKE.txt in Cloud Storage
Stored Books.txt in Cloud Storage
Stored Stock Market.txt in Cloud Storage
Stored Me.txt in Cloud Storage
Stored Positive Excuse.txt in Cloud Storage
Stored Data Science.txt in Cloud Storage
Stored Reach Out.txt in Cloud Storage
Stored Biographies.txt in Cloud Storage
Stored Can Do It.txt in Cloud Storage
Stored Self Authoring.txt in Cloud Storage
Stored Public speaking.txt in Cloud Storage
Stored US Life.txt in Cloud Storage
Stored Explore US.txt in Cloud Storage
Stored Money .txt in Cloud Storage
Stored L

### YouTube Setup

In [None]:
!pip install youtube-transcript-api

In [None]:
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from google.cloud import storage
import os

# Initialize the YouTube API client
def youtube_service(api_key):
    """Build a YouTube Data API v3 client.

    Args:
        api_key: Developer key passed to ``build`` as ``developerKey``.

    Returns:
        The client object returned by ``build``.
    """
    client = build('youtube', 'v3', developerKey=api_key)
    return client

# Get list of all videos in a YouTube channel
def list_youtube_videos(api_key, channel_id):
    """List the videos of a YouTube channel.

    The search endpoint returns at most ``maxResults`` items per call, so
    this follows ``nextPageToken`` until the API stops returning one instead
    of stopping after the first page.

    Args:
        api_key: Developer key used to build the API client.
        channel_id: ID of the channel whose videos are listed.

    Returns:
        A list of ``{'id': <video id>, 'title': <video title>}`` dicts in the
        order the API returned them.
    """
    service = youtube_service(api_key)
    video_list = []
    page_token = None

    while True:
        response = service.search().list(
            part='snippet',
            channelId=channel_id,
            maxResults=50,  # Page size, not a cap on the total
            type='video',
            pageToken=page_token,
        ).execute()

        video_list.extend(
            {'id': item['id']['videoId'], 'title': item['snippet']['title']}
            for item in response.get('items', [])
        )

        # A missing or empty token means this was the last page.
        page_token = response.get('nextPageToken')
        if not page_token:
            return video_list

# Extract video description using video ID
def extract_youtube_video_description(api_key, video_id):
    """Return the description of a YouTube video.

    Args:
        api_key: Developer key used to build the API client.
        video_id: ID of the video to look up.

    Returns:
        The ``snippet.description`` string of the first item in the response,
        or an empty string when the response has no items or no description
        (the original raised ``IndexError`` on an empty item list).
    """
    service = youtube_service(api_key)
    response = service.videos().list(part='snippet', id=video_id).execute()

    items = response.get('items') or []
    if not items:
        return ''
    return items[0].get('snippet', {}).get('description', '')

# Get video transcript using video ID
def get_youtube_video_transcript(video_id):
    """Return a video's transcript as one space-joined string.

    Args:
        video_id: ID of the video whose transcript is requested.

    Returns:
        The transcript segments' ``'text'`` values joined by single spaces,
        or ``None`` if fetching or joining raises (the error is printed).
    """
    try:
        # The library is installed unpinned. If the installed release has no
        # static `get_transcript`, fall back to the instance API
        # (`fetch().to_raw_data()`). Without this, the broad `except` below
        # would report every video as having no transcript.
        legacy_fetch = getattr(YouTubeTranscriptApi, 'get_transcript', None)
        if legacy_fetch is not None:
            transcript = legacy_fetch(video_id)
        else:
            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

        transcript_text = ' '.join([item['text'] for item in transcript])
        return transcript_text
    except Exception as e:
        # Deliberate best effort: a missing transcript must not stop the run.
        print(f"Transcript not available for video {video_id}: {e}")
        return None

# Clean text (removes excessive spaces and special characters)
def clean_text(text):
    """Strip punctuation and normalise whitespace in ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character that is neither a word character nor
        whitespace removed, each run of whitespace collapsed to one space,
        and leading/trailing whitespace stripped.
    """
    # Imported here because this cell's import list does not include `re`;
    # without it the function only works if an earlier cell imported it.
    import re

    # Punctuation is removed first: removing it after collapsing whitespace
    # would leave a double space wherever a symbol stood alone ("a | b").
    text = re.sub(r'[^\w\s]', '', text)  # Remove all special characters
    text = re.sub(r'\s+', ' ', text)  # Replace whitespace runs with a single space
    return text.strip()

# Store text file in Google Cloud Storage
def store_in_cloud_storage(file_name, content, bucket_name='char-bot'):
    """Upload ``content`` to Cloud Storage as the object ``<file_name>.txt``.

    Args:
        file_name: Base name of the object; ".txt" is appended.
        content: String content to upload.
        bucket_name: Name of the destination bucket.

    A confirmation line is printed after the upload.
    """
    gcs_client = storage.Client()
    target = gcs_client.get_bucket(bucket_name).blob(f"{file_name}.txt")
    target.upload_from_string(content)
    print(f"Stored {file_name}.txt in Cloud Storage")

# Main function to list videos, extract descriptions, transcripts, and store them
def main(api_key, channel_id):
    """Store title, description and transcript of each channel video.

    For every video returned by ``list_youtube_videos`` the description and
    transcript are fetched, combined with the title into one string, cleaned
    with ``clean_text`` and uploaded under the video's title. Prints a
    message and returns early when no videos are listed.

    Args:
        api_key: Developer key forwarded to the YouTube helper functions.
        channel_id: ID of the channel to process.
    """
    videos = list_youtube_videos(api_key, channel_id)
    if not videos:
        print("No videos found.")
        return

    for video in videos:
        video_id, video_title = video['id'], video['title']

        description = extract_youtube_video_description(api_key, video_id)
        transcript = get_youtube_video_transcript(video_id)

        # A missing transcript is replaced by a fixed placeholder sentence.
        content = f"Title: {video_title}\n\nDescription: {description}\n\nTranscript: {transcript or 'Transcript not available'}"

        store_in_cloud_storage(video_title, clean_text(content))

if __name__ == '__main__':
    # The API key is a secret and must not be stored in the notebook. It is
    # read from the YOUTUBE_API_KEY environment variable, with an interactive
    # prompt (input hidden) as fallback. Any key that was ever saved in a
    # shared notebook should be treated as compromised and rotated.
    from getpass import getpass

    api_key = os.environ.get('YOUTUBE_API_KEY') or getpass('YouTube API key: ')
    # The channel ID is not a secret; it can be overridden via YOUTUBE_CHANNEL_ID.
    channel_id = os.environ.get('YOUTUBE_CHANNEL_ID', 'UCkStyeYHbmAftDDeyK0ifyg')
    main(api_key, channel_id)


Stored Bajaj Boxer 1999 | First Gear Challenge.txt in Cloud Storage
Transcript not available for video nF-tvAD1tJc: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=nF-tvAD1tJc! This is most likely caused by:

Subtitles are disabled for this video

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!
Stored Bangalore Night Time lapse | Tata Promote | Nice road.txt in Cloud Storage
Stored Break the Beard panga | Sports men shows his style | Break the beard Challenge.txt in Cloud Storage
Transcript not available for video bm-L61KY2DQ: 
Could not retrieve a transcript for the video https://www.youtube.com/wat

## Knowledge base

In [None]:
from google.cloud import storage
import os

# Set environment variable for authentication (replace with your service account path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/local-key.json"

# Initialize the client
client = storage.Client()

# Specify your bucket name
bucket_name = 'char-bot'
# `bucket` is reused by the later cells that download and upload the merged file.
bucket = client.get_bucket(bucket_name)

# Get all text files from the bucket
blobs = bucket.list_blobs()
# Object names only (not contents); the merge cell downloads each one by name.
text_files = []

for blob in blobs:
    # Selection is by name suffix alone, so every '.txt' object in the bucket
    # is included regardless of which cell or run created it.
    if blob.name.endswith('.txt'):
        text_files.append(blob.name)

print(f'Found {len(text_files)} text files.')


Found 112 text files.


# Data to Dialogflow CX setup

### Create a master text file combining all the text files.



In [None]:
# Name of the local merged file written below. A previous run of this notebook
# uploads that file back into the same bucket under an object name ending in
# this file name, so such objects are skipped here; otherwise a re-run would
# merge the old merged file into the new one.
merged_file_name = "merged_file.txt"
source_files = [
    name for name in text_files
    if name.rsplit('/', 1)[-1] != merged_file_name
]

# Sections are collected in a list and joined once, instead of growing one
# string inside the loop.
merged_sections = []

for i, file in enumerate(source_files):
    blob = bucket.blob(file)
    content = blob.download_as_text()

    # Append file content with an index
    merged_sections.append(f"### Document {i+1}: {file} ###\n{content}\n\n")

master_text = "".join(merged_sections)

# Save the merged content to a new file. The encoding is explicit so that
# non-ASCII text is written the same way on every platform.
with open(merged_file_name, "w", encoding="utf-8") as f:
    f.write(master_text)

print("All text files merged with indexing.")


All text files merged with indexing.


In [None]:
# Upload the local merged file (written by the previous cell) to the bucket.
# NOTE(review): the object name below is the string '/content/merged_file.txt',
# leading slash included, while the local file is 'merged_file.txt'. This looks
# like a local path used as an object name -- confirm it is intended.
# NOTE(review): the object name ends in '.txt', so the knowledge-base listing
# cell will pick it up as a source file on a later run.
blob = bucket.blob('/content/merged_file.txt')
blob.upload_from_filename('merged_file.txt')

print("Merged file uploaded to GCS.")


Merged file uploaded to GCS.


### Fixed factual data

In [None]:
### Fixed answers: give these to the data store.

import pandas as pd

# Column names shared by every FAQ row, in CSV column order
_faq_columns = ("question", "answer", "title", "url")

# One tuple per FAQ entry, with values in the same order as _faq_columns
_faq_rows = [
    ("Why is the sky blue?", "The sky is blue because of Rayleigh scattering.", "Rayleigh scattering", "https://en.wikipedia.org/wiki/Rayleigh_scattering"),
    ("What is charan's age?", "24", "About", ""),
    ("What is charan's height?", "5 feet 11 inches", "About", ""),
    ("What is charan's weight?", "As of October 2024, its 72kg", "About", ""),
    ("What is charan's Date of birth?", "29 October 1999", "About", ""),
]

# List of dictionaries, one per CSV row
data = [dict(zip(_faq_columns, row)) for row in _faq_rows]

# Tabular form of the FAQ entries
df = pd.DataFrame(data)

# Write the table to disk; index=False keeps row numbers out of the file
csv_filename = "faq.csv"
df.to_csv(csv_filename, index=False)

print(f"FAQ data saved to {csv_filename}")

FAQ data saved to faq.csv


## Not used

In [None]:
!pip install google-cloud-storage google-cloud-dialogflow google-cloud-dialogflow-cx


In [None]:
from google.cloud import storage
import json

# Constants
BUCKET_NAME = 'char-bot'

# Initialize Google Cloud Storage client
def list_blob_files(bucket_name):
    """Return the blob listing of a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket to list.

    Returns:
        Whatever ``bucket.list_blobs()`` returns for that bucket.
    """
    return storage.Client().get_bucket(bucket_name).list_blobs()

# Read content from each blob (text file) in 'char-bot' storage bucket
def read_text_from_blob(bucket_name, blob_name):
    """Download one Cloud Storage object as text.

    Args:
        bucket_name: Name of the bucket holding the object.
        blob_name: Name of the object to download.

    Returns:
        The value returned by the blob's ``download_as_text()``.
    """
    source_bucket = storage.Client().get_bucket(bucket_name)
    return source_bucket.blob(blob_name).download_as_text()

# Iterate through blobs and extract content
def process_text_files():
    """Download every object in the ``BUCKET_NAME`` bucket as text.

    Returns:
        A list of ``{"file_name": <object name>, "content": <text>}`` dicts,
        one per blob, in listing order.
    """
    # Each blob yielded by the listing is downloaded directly. Going through
    # read_text_from_blob would create a new client and fetch the bucket
    # again for every single file.
    return [
        {"file_name": blob.name, "content": blob.download_as_text()}
        for blob in list_blob_files(BUCKET_NAME)
    ]

# Preprocess text files and create knowledge base
def create_knowledge_base(all_text_files):
    """Turn downloaded text files into an intent-style knowledge base.

    Args:
        all_text_files: Iterable of dicts with the keys ``"file_name"`` and
            ``"content"`` (both strings).

    Returns:
        ``{"intents": [...]}`` with one entry per input file. Each entry has
        ``"name"`` (the file name), ``"training_phrases"`` (the non-blank
        lines of the stripped content, each stripped of surrounding
        whitespace) and ``"responses"`` (a one-element list holding the whole
        stripped content).
    """
    intents = []

    for file in all_text_files:
        content = file["content"].strip()

        # Blank lines are dropped so they do not become empty training phrases.
        training_phrases = [
            line.strip() for line in content.splitlines() if line.strip()
        ]

        intents.append({
            "name": file["file_name"],
            "training_phrases": training_phrases,
            "responses": [content],  # Use the whole content as a response
        })

    return {"intents": intents}

# Save knowledge base to JSON file locally
def save_knowledge_base_to_json(knowledge_base, filename='knowledge_base.json'):
    """Write ``knowledge_base`` to a local JSON file.

    Args:
        knowledge_base: JSON-serialisable object to write.
        filename: Path of the file to create or overwrite.

    The output is indented by four spaces per level.
    """
    with open(filename, 'w') as out_stream:
        json.dump(obj=knowledge_base, fp=out_stream, indent=4)

# Upload JSON file to Google Cloud Storage
def upload_to_gcs(filename, bucket_name):
    """Upload a local file to Cloud Storage under the same name.

    Args:
        filename: Local path of the file; also used as the object name.
        bucket_name: Name of the destination bucket.

    A confirmation line is printed after the upload.
    """
    destination = storage.Client().get_bucket(bucket_name)
    destination.blob(filename).upload_from_filename(filename)
    print(f"{filename} has been uploaded to {bucket_name}.")

# Main function to process files and create knowledge base
def main():
    """Build the knowledge base from the bucket and upload it as JSON.

    Downloads every text file via ``process_text_files``, converts the result
    with ``create_knowledge_base``, writes it to 'knowledge_base.json'
    locally and uploads that file to the ``BUCKET_NAME`` bucket.
    """
    local_filename = 'knowledge_base.json'

    knowledge_base = create_knowledge_base(process_text_files())
    save_knowledge_base_to_json(knowledge_base, local_filename)

    # The object keeps the local file name inside the bucket.
    upload_to_gcs(local_filename, BUCKET_NAME)

if __name__ == "__main__":
    main()


In [None]:
import os
from google.cloud import dialogflowcx_v3beta1 as dialogflow_cx

# Set environment variable for authentication (replace with your service account path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/local-key.json"

# Project and agent setup
# These three values are read as globals by create_intent() below.
project_id = "local-incline-323214"
location = "us-central1"  # e.g., 'us-central1'
agent_id = "cf4a34b6-afd7-427b-a273-acbb648a77c5"
# Full agent resource name. create_intent() does not use this variable; it
# rebuilds the same path through agents_client.agent_path().
agent_path = f'projects/{project_id}/locations/{location}/agents/{agent_id}'

# Initialize Dialogflow CX client, specifying the location
# Both clients are pointed at the endpoint derived from `location`.
agents_client = dialogflow_cx.AgentsClient(client_options={"api_endpoint": f"{location}-dialogflow.googleapis.com"})
intents_client = dialogflow_cx.IntentsClient(client_options={"api_endpoint": f"{location}-dialogflow.googleapis.com"})

# Function to create intents with training phrases and responses
def create_intent(display_name, training_phrases, response_messages):
    """Create one intent in the configured Dialogflow CX agent.

    Uses the module-level ``agents_client``, ``intents_client``,
    ``project_id``, ``location`` and ``agent_id``.

    Args:
        display_name: Display name of the new intent.
        training_phrases: Iterable of phrase strings; each becomes one
            training phrase with a single text part and ``repeat_count=1``.
        response_messages: Accepted for interface compatibility but not sent.
            The intent built here carries only a display name and training
            phrases. NOTE(review): Dialogflow CX appears to configure
            responses on route fulfillment rather than on intents -- confirm.

    Prints the resource name of the created intent.
    """
    parent = agents_client.agent_path(project_id, location, agent_id)

    # One TrainingPhrase per input string, each made of a single text Part.
    training_phrases_obj = [
        dialogflow_cx.Intent.TrainingPhrase(
            parts=[dialogflow_cx.Intent.TrainingPhrase.Part(text=phrase)],
            repeat_count=1,
        )
        for phrase in training_phrases
    ]

    # Define intent object
    intent = dialogflow_cx.Intent(
        display_name=display_name,
        training_phrases=training_phrases_obj,
    )

    # Create intent in Dialogflow CX
    response = intents_client.create_intent(request={"parent": parent, "intent": intent})
    print(f"Intent created: {response.name}")

# Define intents with multiple training phrases and responses
# Each entry supplies the three arguments of create_intent(): "display_name",
# "training_phrases" and "responses" (passed as `response_messages`).
# NOTE(review): create_intent() builds its Intent from the display name and
# training phrases only, so the "responses" below are not attached to the
# created intents.
intents_data = [
    {
        "display_name": "GetName",
        "training_phrases": [
            "What is your name?",
            "Who are you?",
            "Can you tell me your name?"
        ],
        "responses": [
            "My name is ChatGPT.",
            "I am ChatGPT, your virtual assistant."
        ]
    },
    {
        "display_name": "Help",
        "training_phrases": [
            "How can you help me?",
            "I need assistance.",
            "Can you help me with my account?"
        ],
        "responses": [
            "I can assist you with account-related issues.",
            "I’m here to help you with any questions or concerns."
        ]
    },
    {
        "display_name": "Greeting",
        "training_phrases": [
            "Hello",
            "Hi",
            "Hey there"
        ],
        "responses": [
            "Hi! What would you like to explore about Charan today?",
            "Hello! How can I assist you today?"
        ]
    },
    {
        "display_name": "Goodbye",
        "training_phrases": [
            "Goodbye",
            "See you later",
            "Bye"
        ],
        "responses": [
            "Goodbye! Have a great day!",
            "See you later! Take care!"
        ]
    }
]

# Create each intent in Dialogflow CX
for intent_data in intents_data:
    # Arguments are passed positionally in create_intent's parameter order:
    # display name, training phrases, response messages.
    create_intent(
        intent_data["display_name"],
        intent_data["training_phrases"],
        intent_data["responses"],
    )
