In [2]:
import pandas as pd
from google.cloud import storage
from google.cloud import storage
import pandas as pd
from io import BytesIO
from google.cloud import bigquery
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import RobustScaler

In [5]:

# Your GCS bucket name
bucket_name = 'ccai-storage'

# Paths to the Parquet files in your GCS bucket
articles_parquet_path = 'make_prediction/test_file.parquet'

# Initialize the GCS client
client = storage.Client()

# Function to download a Parquet file from GCS and load it into a pandas DataFrame
def load_parquet_from_gcs(bucket_name, file_path):
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(file_path)
    byte_stream = BytesIO()
    blob.download_to_file(byte_stream)
    byte_stream.seek(0)
    df = pd.read_parquet(byte_stream, engine='pyarrow')
    return df

# Load the Parquet files
articles_df = load_parquet_from_gcs(bucket_name, articles_parquet_path)

In [13]:
# Iterate through each column
random_row = articles_df.sample()

for col in random_row.columns:
    print(f"Column: {col}")
    # Iterate through each row in the column
    for val in random_row[col]:
        print(val)
    print("\n")  # Add a newline after each column


Column: uri
7892121222


Column: url
https://www.ilfattoquotidiano.it/2023/12/18/auto-si-schianta-contro-suv-della-scorta-di-biden-il-tonfo-e-la-sorpresa-del-presidente-video/7386540/


Column: title
Auto si schianta contro suv della scorta di Biden: il tonfo e la sorpresa del presidente - Video - Il Fatto Quotidiano


Column: body
Al momento dell'impatto di un'auto contro un suv del corteo presidenziale, Joe Biden stava uscendo da un evento elettorale al quartier generale della campagna per il 2024 a Wilmington. Lo riferisce la Casa Bianca in una nota. Nel momento in cui l'auto, una berlina argentata, ha colpito un suv del corteo gli uomini del Secret Service l'hanno circondata con le pistole puntate e il conducente ha alzato le mani. Il presidente ha assistito alla scena con un'espressione sorpresa e poi è stato scortato alla sua auto e quindi nella sua residenza in Delaware, riferiscono i giornalisti al seguito.


Column: date
2023-12-18


Column: time
08:47:11


Column: dateTime
20

In [1]:
import logging

class DataProcessor:
    def __init__(self, bucket: str, folder: str, num_doc: int, random_seed: int):
        self.bucket = bucket
        self.folder = folder
        self.num_doc = num_doc
        self.random_seed = random_seed
        logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s\n')
        
    def data_preprocessing(self, input_path: str) -> str:
        parquet_file_name = input_path
        processor = GCSParquetLoader(self.bucket, self.folder, input_path, parquet_file_name, self.num_doc, self.random_seed)
        return processor.process()
    
    def data_tokenization(self, input_path: str) -> str:
        file_path = input_path
        parquet_file_name = file_path
        processor = TokenizationProcessor(self.bucket, file_path, self.folder, parquet_file_name)
        return processor.process()
    
    def data_sentiment(self, input_path: str, text_column: str) -> str:
        file_path = input_path
        parquet_file_name = file_path
        processor = GCSSentimentAnalyzer(self.bucket, file_path, self.folder, parquet_file_name, text_column, self.num_doc)
        return processor.process()
    
    def data_moderate(self, input_path: str, text_column: str) -> str:
        file_path = input_path
        parquet_file_name = file_path
        processor = GCSTextModerationLoader(self.bucket, file_path, self.folder, parquet_file_name, text_column, self.num_doc)
        return processor.process()
    
    def data_entities(self, input_path: str, text_column: str) -> str:
        file_path = input_path
        parquet_file_name = file_path
        processor = GCSCEntityAnalyzer(self.bucket, file_path, self.folder, parquet_file_name, text_column, self.num_doc)
        return processor.process()
    
    def data_bigquery(self, input_path: str, project_id: str, dataname: str, table_id: str, location: str) -> str:
        file_path = input_path
        parquet_file_name = file_path
        processor = GCS_Bigquery(self.bucket, file_path, self.folder, parquet_file_name, project_id, dataname, table_id, location)
        return processor.upload_dataframe_to_bigquery()


In [2]:
from pipeline_preprocessing import DataProcessor

import kfp
from kfp import compiler
from kfp.dsl import component, pipeline, Artifact, ClassificationMetrics, Input, Output, Model, Metrics

from google.cloud import aiplatform as aip
from typing import NamedTuple

from datetime import datetime

#import kfp.v2.dsl as dsl
#import google_cloud_pipeline_components as gcc_aip
from google_cloud_pipeline_components.v1.dataset import TabularDatasetCreateOp
from google_cloud_pipeline_components.v1.automl.training_job import AutoMLTabularTrainingJobRunOp
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp

from google.cloud import bigquery 
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value
import json
import numpy as np


PROJECT_ID = 'gcp-ccai-auto-ml-contactcenter'
REGION= "europe-west3"
REPO_NAME = "repo-demo3"
SERVICE_ACCOUNT = "944308723981-compute@developer.gserviceaccount.com"
BUCKET = "ccai-storage"
PIPELINE_NAME = "automl_pipeline"
YAML_NAME = f"{PIPELINE_NAME}.yml"
PIPELINE_ROOT = f"gs://{BUCKET}/pipeline_root/"
DISPLAY_NAME = PIPELINE_NAME.replace("_", "-")
NOTEBOOK = "automl"
DATANAME = "datasetnlp"
FILE_PATH = 'test_file.parquet'
FOLDER = 'make_prediction'
PROJECT_ID = 'gcp-ccai-auto-ml-contactcenter'
TABLE_ID = "testdatabq"
TEXT_COLUMN = 'body_pre'
LOCATION = "europe-west3"
NUM_DOC = 20
RANDOM_SEED=123
#BQ_SOURCE = "bq://gcp-ccai-auto-ml-contactcenter.datasetnlp.stepfinalbq"
OUTPUT_PROCESSING = 'output_processing.parquet'
OUTPUT_TOKENIZATION = 'output_tokenized.parquet'
OUTPUT_SENTIMENT = 'output_sentiment.parquet'
OUTPUT_MODERATE = 'output_moderate.parquet'
OUTPUT_ENTITIES = 'output_entities.parquet'
OUTPUT_FINAL = 'step_final_bq.parquet'

# Resources
DEPLOY_COMPUTE = 'n1-standard-4'

aip.init(project=PROJECT_ID, staging_bucket=PIPELINE_ROOT, location=REGION)
bq = bigquery.Client()

# Initialize DataProcessor object
data_processor = DataProcessor(bucket=BUCKET, folder=FOLDER, num_doc=NUM_DOC, random_seed=RANDOM_SEED)

In [3]:
bigquery_upload_status = data_processor.data_bigquery(file_path=entity_data_path, 
                                                      parquet_file_name=OUTPUT_FINAL, 
                                                      project_id=PROJECT_ID, 
                                                      dataname=DATANAME, 
                                                      table_id=TABLE_ID, 
                                                      location=LOCATION)
print("BigQuery Upload completed. Status:", bigquery_upload_status)

NameError: name 'entity_data_path' is not defined