# Using PyDrive2 to process g-drive files

In [70]:
!pip install PyDrive2

Collecting PyDrive2
  Downloading PyDrive2-1.10.1-py3-none-any.whl (39 kB)
Collecting pyOpenSSL>=19.1.0
  Downloading pyOpenSSL-22.0.0-py2.py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 KB[0m [31m509.2 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting cryptography>=35.0
  Downloading cryptography-37.0.2-cp36-abi3-macosx_10_10_x86_64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Installing collected packages: cryptography, pyOpenSSL, PyDrive2
Successfully installed PyDrive2-1.10.1 cryptography-37.0.2 pyOpenSSL-22.0.0
You should consider upgrading via the '/Users/jeintron/.pyenv/versions/3.9.1/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

## Authenticate

This authentication routine *should* just work, even if you don't have a "mcreds.txt" file.  If it doesn't, let me know!

In [71]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
 
gauth = GoogleAuth()
gauth.LoadCredentialsFile("mycreds.txt")
if gauth.credentials is None:
    gauth.GetFlow()
    gauth.flow.params.update({'access_type': 'offline'})
    gauth.flow.params.update({'approval_prompt': 'force'})
    
    # Authenticate if they're not there
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    # Refresh them if expired
    gauth.Refresh()
else:
    # Initialize the saved creds
    gauth.Authorize()
# Save the current credentials to a file
gauth.SaveCredentialsFile("mycreds.txt")
drive = GoogleDrive(gauth)

## Set up folder ids

These are pulled directly from the ids when I browse to the respective folders.  Simply a matter of convenience here.

In [72]:
#The folder we're looking for is 19H1uLY6PL6XcGu0E1fSXMxYWUSM4XFaL - (you can find this on the URL of the folder you want)
folders = {"CNN":'1Lz8u7wkDr4wYmSzsIKle0lQVWUvO1yim',
"CrooksAndLiars":'102aK5TvKde43bQ-lgS5bX1oVeh1lA6u3',
"NPR": "1ImjUNTH9tzQnQItdjQeb82HppngNx9Ir",
"Reason": "1CaeCbBCpaIsdHtmOa0OhJ-weDKjQqv-P",
"OANN":"1WFCXxukdjngqjQ9HzKVE3f11L2rF9EP4"}

non_en_folders = {
    "CNN": "1g_j5Si8Q2YSyJXAZIbhaInzrwVrM6B-I",
    "CrooksAndLiars": "1x9EKlchL2g4knb019KEndxOrzXnCR9C3",
    "NPR": "1F1-Kxz06AoMzDnB52bOvG-IizSi9oLMe",
    "Reason": "1WC7L_h-KUTsCtx0NDKwjj8pXPr5sDdFd",
    "OANN":"1L5pfNbh_UZIDQ__JetblEJFX_kfp6Uno"

}

driveId = '0AM-VeyaNeDrSUk9PVA'


# Gets the files from the folder on the shared drive
def get_files(folder):
    fileList = drive.ListFile({'q': f"'{folders[folder]}' in parents and trashed=false",'corpora':'drive','driveId':f'{driveId}',"includeItemsFromAllDrives":"true","supportsAllDrives":"true"}).GetList()
    return [f for f in fileList]

#get_files("CNN")  



## Working with gdrive files

In [None]:
# Upload a file from the local drive to designated folder id
# -- fname should be either a full path or refer to a file in the current working directory
def upload_file(fname,folder):
    file1 = drive.CreateFile({"mimeType": "text/csv", "parents": [{"id": folder}]})
    file1.SetContentFile(fname)
    file1.Upload(param={'supportsTeamDrives': True}) # Upload the file.


# Move an existing gdrive file to a new folder id; this function works by downloading the file locally, 
# uploading it to the new location, then deleting the local copy
# -- note that a "gdrive_file" is an object returned by the pydrive API
#    the "get_files" function above returns a set of these

def move_file(gdrive_file,folder_to):
    ofile = gdrive_file['title']
    gdrive_file.GetContentFile(ofile)
    file1 = drive.CreateFile({"mimeType": "text/csv", "parents": [{"id": folder_to}]})
    file1.SetContentFile(ofile)
    file1.Upload(param={'supportsTeamDrives': True}) # Upload the file.
    gdrive_file.Trash()
    os.remove(ofile)

## Processing JSON files

These functions should do everything you need to transform a json file into a nicely formed csv.

In [82]:
import math
import re
import pandas as pd
pat = re.compile("RT\s@(\S+):")


# Helper function to move the retweet up in the dictionary object
def lift_retweet(ref_t, text):
    #print(ref_t)
    if type(ref_t) is list:
        x = list(filter(lambda x: x['type']=="retweeted",ref_t))
        if x:
            result = {"id":x[0]["id"]}
            result.update(x[0]['expanded']['public_metrics'])
            result["created_at"] = x[0]['expanded']['created_at']
            result["author_id"] = x[0]['expanded']['author_id']
            m = pat.search(text)
            if not m:
                #print(f"Couldn't find handle in text {text}")
                result["author_handle"] = "<missing tweet text>"
            else:
                result["author_handle"]= m.group(1)
        
            return {f"rt_{k}":v for k,v in result.items()}
    return None


# Note that this function presumes that we've used the previous code to append
# a whole bunch of separate json responses together.  So the first thing is does
# is corrects the file so that it is well-formed json.
# After that is filters out the columns we want, and parses the "referenced_tweets" column 
# to extract retweet information
def process_file(f):
    with open(f) as fin:
        s = fin.read()
        if s[-2:] == ",]":
            s = s[0:-2]+"]"
        if s.find("]{")>-1:
            s= s.replace("]{",",{")
        if s[-1] != "]":
            print("Add terminal square bracket")
            s = s+"]"
        if s.find(",,") > -1:
            s = s.replace(",,",",")
    
    data = pd.read_json(s)
    cols = {'created_at','conversation_id','id','text','in_reply_to_user_id'}.intersection(set(data.columns))
    if "referenced_tweets" in data.columns:
        ndata = pd.DataFrame(data.apply(lambda x: lift_retweet(x.referenced_tweets,x.text),axis=1)).iloc[:,0].apply(pd.Series,dtype="object")
        data = pd.concat([data[list(cols)],ndata],axis=1)
    else:
        print("No retweets...")
        data = data[list(cols)]
    nfile = f"{f.split('.json')[0]}.csv"
    data.to_csv(nfile)
    return nfile





## Scripting

The following code calls the above functions to process json files into gdrive files.  Note that the old json files are still there, so you might want to fiddle with this to process just the new stuff.

In [56]:
# For remote files

for f in files:
    if f['title'].endswith(".json"):
        try:
            ofile = f['title']
            f.GetContentFile(ofile)
            nfile = process_file(ofile)
            upload_file(nfile)
            os.remove(ofile)
            os.remove(nfile)
        except Exception as e:
            print(f"Error processing {ofile}")
            print(e)
        print(".",end="")

.........................................................................................................................................................................................................

The following is for any local json files you might have

In [52]:
# For local files

import os
for f in os.listdir():
    if f.endswith("crooksandliars.json"):
        try:
            nfile = process_file(f)
            upload_file(nfile)
            os.remove(f)
            os.remove(nfile)
        except Exception as e:
            print(f"Error processing {f}")
            print(e)
        print(".",end="")


Add terminal square bracket
Error processing 182446931_crooksandliars.json
Unexpected character found when decoding array value (1)
.Add terminal square bracket
Error processing 72795851_crooksandliars.json
Unexpected character found when decoding array value (1)
.

## Vestigial 

The rest of the code is for reference - perhaps not mission critical, but still useful

In [None]:
%run "./topic_extraction.py"
folderId = '19H1uLY6PL6XcGu0E1fSXMxYWUSM4XFaL'
fileList = drive.ListFile({'q': "'19H1uLY6PL6XcGu0E1fSXMxYWUSM4XFaL' in parents and trashed=false",'corpora':'drive','driveId':'0AM-VeyaNeDrSUk9PVA',"includeItemsFromAllDrives":"true","supportsAllDrives":"true"}).GetList()
for file in fileList:
    with open("processing.log","a") as f:
       file.GetContentFile(file['title'])
       pipeline(f,"./") 
    

In [1]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 KB[0m [31m994.1 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.2-py2.py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp39-cp39-macosx_11_0_x86_64.whl size=348869 sha256=499a1c0858cced502844c80309df8742be4fee87e387579d2d4a29629da932c7
  Stored in directory: /Users/jeintron/Library/Caches/pip/wheels/64/57/bc/1741406019061d5664914b070bd3e71f6244648732bc96109e
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.2
You should consider upgrading via the '/Users/jeintron/.pyenv/versions/3.9.1/bin/python3.9 -m pip install --upgrade 

In [19]:
import fasttext
model = fasttext.load_model("../../../data/lid.176.bin")

def detect_en(x):
    try:
        return model.predict(x)[0][0]=="__label__en"
    except:
        return False



In [88]:
fileslist = get_files("NPR")

In [85]:
import pandas as pd
import re


def process(folder,sample_size=100):
    fileslist:list = get_files(folder)
    print(f"Retrieved {len(fileslist)} files")
    results = {}
    for f in fileslist:
        result = []
        if not re.match("[^.]+\.csv",f['title']):
            print(f"Skipping {f['title']}")
            continue
        f.GetContentFile(f['title'])
        data = pd.read_csv(f['title'],nrows = sample_size)
        if ("text" not in data.columns):
            print(f"{f['title']} is missing data")
            results[f['title']] = 0
        else:
            x = data.text.tolist()
            for line in x:
                line = re.sub("\n+"," ",line)
                result.append(detect_en(line))
            
            results[f['title']] = sum(result)/100
        os.remove(f['title'])
    return results

#cnn_results = process("CNN")

In [80]:
for f in fileslist:
    if f['title'] in cnn_results and cnn_results[f['title']] < .75:
        move_file(f,non_en_folders['CNN'])
        break

ApiRequestError: <HttpError 404 when requesting https://www.googleapis.com/drive/v2/files/1hhQpCkmGsykJL7cRqg394gamsOZz3W6B/trash?alt=json returned "File not found: 1hhQpCkmGsykJL7cRqg394gamsOZz3W6B". Details: "[{'message': 'File not found: 1hhQpCkmGsykJL7cRqg394gamsOZz3W6B', 'domain': 'global', 'reason': 'notFound', 'location': 'file', 'locationType': 'other'}]">

In [None]:
last = None
updated = {}
for k,v in cnn_results.items():
    if v == 0:
        updated[k] = v
        continue
    if last is None:
        last = v
    else:
        updated[k] = v - last
        last = v

updated
    


In [35]:
cnn_results = updated

In [None]:
cnn_results

In [77]:
flist = get_files("CNN")

In [83]:
for x in flist:
    if x['title'] in cnn_results and cnn_results[x['title']] < .75:
        move_file(x,non_en_folders['CNN'])

In [None]:
flist = get_files("NPR")
npr_results = process("NPR")



In [92]:
for x in flist:
    if x['title'] in npr_results and npr_results[x['title']] < .75:
        move_file(x,non_en_folders['NPR'])

In [93]:
folder = "CrooksAndLiars"
flist = get_files(folder)
results = process(folder)
for x in flist:
    if x['title'] in results and results[x['title']] < .75:
        move_file(x,non_en_folders[folder])


Retrieved 538 files
Skipping 18395302_crooksandliars.json
Skipping 43547109_crooksandliars.json
Skipping 46943796_crooksandliars.json
Skipping 32459094_crooksandliars.json
Skipping 271046940_crooksandliars.json
Skipping 52123042_crooksandliars.json
Skipping 405140226_crooksandliars.json
Skipping 179832906_crooksandliars.json
Skipping 60190565_crooksandliars.json
Skipping 15478103_crooksandliars.json
Skipping 188504261_crooksandliars.json
Skipping 40474578_crooksandliars.json
Skipping 49208827_crooksandliars.json
Skipping 40367314_crooksandliars.json
Skipping 296448192_crooksandliars.json
Skipping 184006847_crooksandliars.json
Skipping 25647023_crooksandliars.json
Skipping 16324153_crooksandliars.json
Skipping 16430347_crooksandliars.json
Skipping 191199892_crooksandliars.json
Skipping 337725206_crooksandliars.json
Skipping 66102323_crooksandliars.json
Skipping 53320063_crooksandliars.json
Skipping 66020426_crooksandliars.json
Skipping 36552378_crooksandliars.json
Skipping 81592894_croo

In [None]:
folder = "Reason"
flist = get_files(folder)
results = process(folder)
for x in flist:
    if x['title'] in results and results[x['title']] < .75:
        move_file(x,non_en_folders[folder])