### SEED DATABASE
This notebook is a one-time workflow to bring together various data analysis processes across prototypes and apply them to the initial 120 images that constitute the basic Captious Memes dataset.  
  
Not the official workflow for introducing new images to the database, but basically through this notebook I'm establishing the standard by which those workflows will be built.

In [6]:
#database
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
#data processing
import pandas as pd
# image annotation
import os
from google.cloud import vision
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./google_credentials.json"
transcript_client = vision.ImageAnnotatorClient()

In [2]:
# connect to firebase database
cred = credentials.Certificate('./service_acct_key.json')
firebase_admin.initialize_app(cred, {
  'databaseURL': 'https://meme-room-default-rtdb.firebaseio.com/'
})

<firebase_admin.App at 0x7fd2b831cd00>

In [3]:
#import reviewed images
image_data = db.reference('/reviewed-images').get()

### prepare list of final images relevant images

In [4]:
# prepare list of final images filtered to be relevant
def filter_relevant_images(image_df):

    image_dict = image_df["labels"].to_dict()
    relevant_images = []

    for i, v in image_dict.items():
        try:
            if (v["relevance"] == "relevant"):
                row = image_df.iloc[i].to_dict()
                relevant_images.append(row)
        except TypeError:
            print("no labels found")

    return relevant_images

In [5]:
image_df = pd.DataFrame(list(image_data.values()))
relevant_images = filter_relevant_images(image_df)

In [7]:
def transcribe_images(images):

  img_db = images.copy()

  for img_data in img_db:
    url = img_data['image_url']
    img_data['image_transcript'] = transcribe_image(url)
  return img_db
    


def transcribe_image(url):
  try:
    response = transcript_client.annotate_image({
      'image': {'source': {'image_uri': url}},
      'features': [{'type_': vision.Feature.Type.TEXT_DETECTION}]
    })

    transcript = response.text_annotations[0].description
    return transcript

  except:
    msg = "no transcript"
    return msg
    
    

In [8]:
image_dict_transcripts = transcribe_images(relevant_images)

In [11]:
# flatten image labels
def flatten_image_labels(image_data):
  image_data_with_flat_labels = [];
  for post_data in image_data:
    pd = post_data.copy()
    labels = pd['labels']
    try:
      pd['symbols'] = labels['characters']
    except:
      pd['symbols'] = []
    try:
      pd['content_warning_tags'] = labels['content_warning_tags']
    except:
      pd['content_warning_tags'] = ""
    try:
      pd['meta_tags'] = labels['meta_tags']
    except:
      pd['meta_tags'] = {}
    image_data_with_flat_labels.append(pd)
  return image_data_with_flat_labels

In [12]:
image_dict_flat_labels = flatten_image_labels(image_dict_transcripts)

In [13]:
#clean up data so far and get it ready for final processing + assembling
idf = pd.DataFrame(image_dict_flat_labels)
idf.pop('idx')
idf.pop('idx_2')
idf.pop('labels')
idf.pop('category')
idf['post_author'] = idf['author']
idf.pop('author')
idf['post_title'] = idf['title']
idf.pop('title')
idf['post_id'] = idf['reddit_id']
idf.pop('reddit_id')
idf['post_url'] = idf['permalink']
idf.pop('permalink')
idf['post_num_comments'] = idf['num_comments']
idf['post_num_upvotes'] = idf['num_upvotes']
idf['post_upvote_ratio'] = idf['upvote_ratio']
idf.pop('num_comments')
idf.pop('num_upvotes')
idf.pop('upvote_ratio')

0      0.97
1      0.95
2      0.99
3       1.0
4      0.97
       ... 
102    0.94
103    0.98
104    0.97
105    0.96
106    0.99
Name: upvote_ratio, Length: 107, dtype: object

In [48]:
def remap(n, start1, stop1, start2, stop2):
  return ((n-start1)/(stop1-start1))*(stop2-start2)+start2

def map_popularity_score(score, avg_score, min_score, max_score):
  score = float(score)
  pop_score = 0
  if (score < avg_score):
    pop_score = remap(score, min_score, avg_score, -1, 0)
  elif (score > avg_score):
    pop_score = remap(score, avg_score, max_score, 0, 1)
  return pop_score

In [51]:
# get post popularity scores
subreddit_index = pd.read_json('./subreddit-index-final.json')

sources = idf['subreddit']
num_upvotes = idf['post_num_upvotes']
pop_scores = []
ccdvs = []

i = 0

for score in num_upvotes:
    source = sources[i]
    source_data = subreddit_index[subreddit_index["name"] == source]
    upvote_data = list(source_data['upvote_data'].to_dict().values())[0]
    popularity_score = map_popularity_score(
        score,
        upvote_data['avg_upvotes'],
        upvote_data['min_upvotes'],
        upvote_data['max_upvotes']
    )
    ccdv = source_data['content_creator_diversity_ratio']
    pop_scores.append(popularity_score)
    ccdvs.append(ccdv)
    i += 1


In [59]:
idf['post_popularity_score'] = pop_scores

In [61]:
#get the creation date for each post
# reddit api
import praw
# initialize praw
reddit = praw.Reddit(
    client_id='hS4CPqNExizMF7XJ1XlMBQ',
    client_secret='zMcrLvT31UBI6V-zl2IFC9uR3MRA-g',
    user_agent='reddit-meme-analysis 0.1.1 by /u/inkoh',
    username='inkoh',
    password='Songoku777'
)

reddit.read_only = True

Version 7.3.0 of praw is outdated. Version 7.4.0 was released 2 days ago.


In [71]:
post_ids = idf['post_id']

def get_post_dates(sub_id):
  submission = reddit.submission(id=sub_id)
  return submission.created_utc

post_dates = list(map(get_post_dates, post_ids))



In [75]:
post_dates = list(map(lambda x: str(int(x)), post_dates))
idf['post_created_date'] = post_dates

In [79]:
image_dataset = idf.to_dict(orient="records")

for image_data in image_dataset:
  db.reference('/captious_dataset').push(image_data)

